In [1]:
from datasets.dataset import Dataset

from Util.qm import QueryManager
import argparse
import numpy as np
import time
import pandas as pd
import multiprocessing as mp
from Util import oracle, util2, benchmarks
from tqdm import tqdm
from algorithms import dualquery, mwem, fem
import matplotlib.pyplot as plt

In [2]:
# ---- Experiment settings ----------------------------------------------
dataset = 'adult'
workload = 64
marginal = 3
epsilon_list = np.linspace(0.1, 0.5, 5)  # five evenly spaced budgets in [0.1, 0.5]
total_runs = 5                           # independent repetitions per algorithm

# ---- Dataset ------------------------------------------------------------
data, workloads = benchmarks.randomKway(dataset, workload, marginal)
N = data.df.shape[0]   # number of rows in the loaded frame
delta = 1 / N ** 2

# ---- Queries ------------------------------------------------------------
stime = time.time()
query_manager = QueryManager(data.domain, workloads)
print("Number of queries = ", len(query_manager.queries))

# Answers of every query on the real data; used as the reference for all errors.
real_ans = query_manager.get_answer(data)

Number of queries =  458996


In [None]:
# One DualQuery result per run index; each entry is later invoked as
# dq_data_fun[run](eps) by the evaluation code.
dq_data_fun = {}
for i in range(total_runs):
    dq_data_fun[i] = dualquery.generate(
        real_ans,
        N,
        data.domain,
        query_manager,
        epsilon_list,
        delta,
        eta=3.36,
        samples=9,
    )

In [None]:
# One FEM result per run index, generated with samples=25.
fem_data_fun = {}
for i in range(total_runs):
    fem_data_fun[i] = fem.generate(
        real_ans,
        N,
        data.domain,
        query_manager,
        epsilon_list,
        delta,
        epsilon_split=0.0135,
        noise_multiple=0.099,
        samples=25,
    )

In [3]:
# Same FEM configuration as the 25-sample runs, but with samples=50.
fem_data_fun_50_samples = {}
for i in range(total_runs):
    fem_data_fun_50_samples[i] = fem.generate(
        real_ans,
        N,
        data.domain,
        query_manager,
        epsilon_list,
        delta,
        epsilon_split=0.0135,
        noise_multiple=0.099,
        samples=50,
    )
    

  0%|          | 1/392 [00:00<00:00, 17260.51it/s, eps0=0.00135]

T = 392, noise = 1.9600999974491098


100%|██████████| 392/392 [34:11<00:00,  5.23s/it, eps0=0.00675] 
100%|██████████| 392/392 [35:37<00:00,  5.45s/it, eps0=0.00675] 
100%|██████████| 392/392 [34:48<00:00,  5.33s/it, eps0=0.00675] 
100%|██████████| 392/392 [34:33<00:00,  5.29s/it, eps0=0.00675] 
100%|██████████| 392/392 [33:08<00:00,  5.07s/it, eps0=0.00675] 


In [None]:
def get_error(data_fun, mwem_eps0=0.02, mwem_epsilon_values=(0.5, 1)):
    """Compute max query error of synthetic data, alone and after MWEM.

    Relies on the notebook-level names ``epsilon_list``, ``real_ans``,
    ``query_manager``, ``N``, ``delta`` and the ``mwem`` module.

    Parameters
    ----------
    data_fun : mapping
        Indexed by run id ``0 .. len(data_fun) - 1``. Each value is called as
        ``data_fun[run_id](eps)`` and must return ``(syn_data, cumulative_rho)``.
    mwem_eps0 : float, optional
        Forwarded to ``mwem.generate`` as ``epsilon_split``.
    mwem_epsilon_values : iterable of float, optional
        Values forwarded to ``mwem.generate`` as ``epsilon``. Defaults to the
        grid ``(0.5, 1)``.

    Returns
    -------
    support_error : dict
        ``support_error[run_id]`` is a list with one max-abs-error per entry
        of ``epsilon_list`` for the synthetic data on its own.
    mwem_error : dict
        ``mwem_error[run_id][mwem_eps]`` is the matching list after running
        MWEM on top of the synthetic data.
    """
    # Materialise once so a generator argument can be iterated repeatedly.
    mwem_epsilon_values = list(mwem_epsilon_values)

    support_error = {}
    mwem_error = {}

    for run_id in range(len(data_fun)):
        support_error[run_id] = []
        mwem_error[run_id] = {mwem_eps: [] for mwem_eps in mwem_epsilon_values}

        for eps in epsilon_list:
            syn_data, cumulative_rho = data_fun[run_id](eps)

            # Worst-case absolute deviation from the real answers.
            q_max_error = np.abs(real_ans - query_manager.get_answer(syn_data)).max()
            support_error[run_id].append(q_max_error)

            for mwem_eps in mwem_epsilon_values:
                # A is passed straight back to get_answer together with the
                # support -- presumably per-row weights; confirm in mwem.generate.
                mwem_support, A = mwem.generate(
                    syn_data, real_ans, N, query_manager,
                    epsilon=mwem_eps, delta=delta,
                    epsilon_split=mwem_eps0,
                    cumulative_rho=cumulative_rho,
                )
                mwem_max_error = np.abs(
                    real_ans - query_manager.get_answer(mwem_support, A)
                ).max()
                mwem_error[run_id][mwem_eps].append(mwem_max_error)

    return support_error, mwem_error


# 
def plot_error(title, support_label, mwem_label, support_error, mwem_error):
    """Plot mean max-error curves with min/max bands on the current axes.

    The x axis is the notebook-level ``epsilon_list``; drawing goes through
    the ``matplotlib.pyplot`` state machine (``plt``).

    Parameters
    ----------
    title : str
        Figure title.
    support_label : str
        Legend label for the synthetic-data-only curve.
    mwem_label : str
        Legend label prefix for the MWEM curves; ``', epsilon=<value>'`` is
        appended for each MWEM epsilon.
    support_error : dict
        ``support_error[run_id]`` is a list of errors, one per entry of
        ``epsilon_list``, for run ids ``0 .. len(support_error) - 1``.
    mwem_error : dict
        ``mwem_error[run_id][mwem_eps]`` is a list of errors shaped like the
        entries of ``support_error``.
    """
    plt.title(title)

    # Take the run count from the data being plotted rather than from a
    # notebook global, so the two can never disagree.
    run_ids = range(len(support_error))

    support_error_np = np.array([support_error[run_id] for run_id in run_ids])
    support_error_max = np.max(support_error_np, axis=0)
    support_error_min = np.min(support_error_np, axis=0)
    support_error_ave = np.mean(support_error_np, axis=0)
    plt.fill_between(epsilon_list, support_error_min, support_error_max, alpha=0.1)
    plt.plot(epsilon_list, support_error_ave, '--o', label=support_label)

    # The MWEM epsilons are the keys of each per-run dict; read them from the
    # first run instead of depending on a name that is not defined globally.
    mwem_epsilon_values = list(next(iter(mwem_error.values()), {}))

    for mwem_eps in mwem_epsilon_values:
        mwem_error_np = np.array([mwem_error[run_id][mwem_eps] for run_id in run_ids])
        mwem_error_max = np.max(mwem_error_np, axis=0)
        mwem_error_min = np.min(mwem_error_np, axis=0)
        mwem_error_average = np.mean(mwem_error_np, axis=0)
        plt.fill_between(epsilon_list, mwem_error_min, mwem_error_max, alpha=0.1)
        plt.plot(epsilon_list, mwem_error_average, '-o', label=f'{mwem_label}, epsilon={mwem_eps}')

    plt.xticks(epsilon_list, rotation='vertical')
    plt.xlabel('epsilon budget split')
    plt.ylabel('max-error')
    plt.legend()
    plt.grid(linestyle='-')
    plt.box(on=None)
    

In [None]:
# Sweep the mwem_eps0 setting (forwarded by get_error to mwem.generate as
# epsilon_split) and draw one figure per value.
eps0_list = [0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04]

for eps0 in eps0_list:
    fem_50_error, fem_mwem_50_error = get_error(fem_data_fun_50_samples, mwem_eps0=eps0)
    # Start a fresh figure so successive settings are not overlaid on one set of axes.
    plt.figure()
    # f-string: the previous '{eps0}'.format(eps0) raised KeyError because a
    # named field was given a positional argument.
    plot_error(f'FEM+MWEM 50 samples. mwem_eps0={eps0}', 'FEM alone', 'FEM+MWEM', fem_50_error, fem_mwem_50_error)
    plt.show()
# fem_25_error, fem_mwem_25_error = get_error(fem_data_fun)

In [None]:
# The 25-sample errors have no live assignment in any earlier cell (the only
# one is commented out), so compute them here before plotting.
fem_25_error, fem_mwem_25_error = get_error(fem_data_fun)
plot_error('FEM+MWEM 25 samples. Averaged over 5 runs', 'FEM alone', 'FEM+MWEM', fem_25_error, fem_mwem_25_error)


In [None]:
# Evaluate the DualQuery runs, then plot them next to their MWEM-refined counterparts.
dq_error, dq_mwem_error = get_error(dq_data_fun)
plot_error(
    title='DQ+MWEM. Averaged over 5 runs',
    support_label='DQ alone',
    mwem_label='DQ+MWEM',
    support_error=dq_error,
    mwem_error=dq_mwem_error,
)
