# Experiments

## Imports required by all experiments

In [None]:
# module imports
import math
import pandas as pd
import numpy as np
from multiprocessing.shared_memory import SharedMemory
import os
import pickle
import itertools
import logging
from tqdm.auto import tqdm
import json
import uuid

# my imports
from exp_utils import single_exp, load_cms, load_prior, set_cms_eps, unload_cms, attack_real_users, \
    attack, run_utility_vs_eps
from cms import CMS
from plot_lib import init_plotting, init_figure, plot_prec_nr, plot_conf, plot_gammadelta, \
    parse_pickle_prec_nr, plot_gammadelta_heatmap, plot_spl_conf, colors_to_use
from exp_params import ExpParams
from base_distribution import zip_pdf, random_pdf

# initialize plotting format (the 'ieee' preset of plot_lib)
init_plotting('ieee')

# prepare relevant folders: cached artefacts live under pickles/, figures under imgs/
exps_folder = 'pickles/experiments/'
hash_tables_folder = 'pickles/hash_tables/'
hadamard_matrices_folder = 'pickles/hadamard_matrices/'
priors_folder = 'pickles/priors/'
imgs_folder = 'imgs/'
p_omegas_folder = 'pickles/p_omegas/'

# create every folder up front; already existing folders are left untouched
for _folder in (exps_folder, hash_tables_folder, hadamard_matrices_folder,
                priors_folder, imgs_folder, p_omegas_folder):
    os.makedirs(_folder, exist_ok=True)

# initialize logging to track experiment status after exiting notebook
# (messages are appended to pickles/log.txt, which exists as a folder by now)
logging.basicConfig(
    filename='pickles/log.txt',
    filemode='a',
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# number of users to use in main experiments (default: 150000)
num_users = 150000

## Attack on Emojis 

In [None]:
# emojis parameters
setting = 'emojis'
U, m, k, eps = 2600, 1024, 65536, 4
pool_sizes = [228, 228, 228, 228, 228, 228]
p_omega = f'{p_omegas_folder}/{setting}_p_omega.pickle'
prior = f'{priors_folder}/{setting}_prior.pickle'
kappa = 1.2
ns = [7, 30, 90, 180]

# user metadata: one random seed per user, reused by every run in this cell
user_seeds = np.random.randint(0, 2147483647, size=num_users)

logging.info(f'{setting} => starting')

# load cms into shared memory
shared_cms = load_cms(U, m, k, eps=eps, shared=True)
logging.info(f'{setting} => loaded shared cms')

try:
    # generate zipf p_\Omega only when no cached copy exists on disk
    if not os.path.isfile(p_omega):
        # one zipf block per pool plus one block for the rest of the universe,
        # renormalised so that the concatenation sums to 1
        p_omega_f = np.concatenate([zip_pdf(kappa, p) for p in pool_sizes] +
                                   [zip_pdf(kappa, U - sum(pool_sizes))])
        p_omega_f /= p_omega_f.sum()
        with open(p_omega, 'wb') as fh:
            pickle.dump(p_omega_f, fh)
    logging.info(f'{setting} => loaded p_omega')

    # generate/load prior
    load_prior(U, m, k, eps, pool_sizes, p_omega, prior)
    logging.info(f'{setting} => loaded prior ({prior})')
    logging.info(f'{setting} => finished setting up')

    for prior_type in ['estimated', 'uniform']:
        # the 'uniform' run passes prior=None, the 'estimated' run the prior file
        curr_prior = None if prior_type == 'uniform' else prior

        # pack experiment parameters into object
        sub_folder = f'{setting}_{prior_type}_prior'
        exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
        exp_params = ExpParams(shared_cms, pool_sizes,
            exp_db_filename=exp_db_filename, reps=num_users,
            p_omega=p_omega, prior=curr_prior)

        logging.info(f'{setting}, {prior_type} => starting')

        for n in ns:
            exp_params.n = n
            logging.info(f'{setting}, {prior_type}, {n} => starting')
            pbar = tqdm(total=exp_params.reps, desc=f'n = {n}: ')
            single_exp(exp_params, user_seeds=user_seeds, pbar=pbar)
            logging.info(f'{setting}, {prior_type}, {n} => finished')

            # persist the result database after every n so that partial
            # progress survives an interrupted run
            with open(exp_db_filename, 'wb') as fh:
                pickle.dump(exp_params.EXP_DB, fh)

        logging.info(f'{setting}, {prior_type} => finished')
finally:
    # always release the shared cms, even when an experiment fails
    unload_cms(shared_cms)

## run non private
# a plain tuple takes the place of the shared cms for the non-private baseline
np_cms = (U, U, math.inf, 0, None, None)
sub_folder = f'{setting}_non_private'
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
exp_params = ExpParams(np_cms, pool_sizes,
    exp_db_filename=exp_db_filename, reps=num_users, p_omega=p_omega)
for n in ns:
    exp_params.n = n
    logging.info(f'{setting}, non-private, {n} => starting')
    pbar = tqdm(total=exp_params.reps, desc=f'n = {n}: ')
    single_exp(exp_params, user_seeds=user_seeds, pbar=pbar)
    logging.info(f'{setting}, non-private, {n} => finished')

    with open(exp_db_filename, 'wb') as fh:
        pickle.dump(exp_params.EXP_DB, fh)
logging.info(f'{setting} => finished')

In [None]:
# precision vs null rate
setting = 'emojis'
n_pools = 6
fig, axs = init_figure(1, 2, 'ieee_double')


def _reorder_legend(entries):
    """Return entries 1-4, then entry 0, then everything from entry 6 on."""
    return entries[1:5] + entries[:1] + entries[6:]


# left panel: uniform prior, then the non-private run on the same axes;
# AUC_DB is threaded through the calls so that it accumulates their results
AUC_DB = plot_prec_nr(f'{setting}_uniform_prior', axs[0], n_pools=n_pools)
AUC_DB = plot_prec_nr(
    f'{setting}_non_private', axs[0], n_pools=n_pools,
    AUC_DB=AUC_DB, nonprivate=True)
axs[0].set_title('$\\text{Adv}_\\textit{weak}$')

# right panel: estimated prior
AUC_DB = plot_prec_nr(
    f'{setting}_estimated_prior', axs[1], n_pools=n_pools, AUC_DB=AUC_DB)
# the non-private run is drawn again here; its return value is discarded
plot_prec_nr(f'{setting}_non_private', axs[1], n_pools=n_pools, nonprivate=True)
axs[1].set_title('$\\text{Adv}_\\textit{strong}$')

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

# custom legend, built from the entries of the last panel
h, l = axs[-1].get_legend_handles_labels()
handles = _reorder_legend(h)
labels = _reorder_legend(l)
legend_title = ('\\hphantom{\\text{.}} \\hspace{0.2cm} '
                '\\textbf{CMS} \\hspace{0.8cm} \\textbf{Non-Private}')
axs[-1].legend(handles, labels, bbox_to_anchor=(1.1, 1.0375), ncol=2,
    columnspacing=0.5, title=legend_title)

fig.tight_layout(w_pad=0)
AUC_DB.to_pickle(f'imgs/{setting}.pkl')
fig.savefig(f'imgs/{setting}_prec_mr.pdf', bbox_inches='tight')

In [None]:
# conf score
setting = 'emojis'
fig, axs = init_figure(1, 2, 'ieee')

# one panel per prior type: (prior type, panel title), in plotting order
conf_panels = [
    ('uniform', '$\\text{Adv}_\\textit{weak}$'),
    ('estimated', '$\\text{Adv}_\\textit{strong}$'),
]
for panel_idx, (panel_prior, panel_title) in enumerate(conf_panels):
    plot_conf(f'{setting}_{panel_prior}_prior', axs[panel_idx])
    axs[panel_idx].set_title(panel_title)

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

# legend (attached to the last panel)
axs[-1].legend(bbox_to_anchor=(1.1, 1.0375))

fig.savefig(f'imgs/{setting}_conf.pdf', bbox_inches='tight')

In [None]:
# contour (uniform): gamma/delta plots of the emojis run with a uniform prior
setting, prior_type, n_pools = 'emojis', 'uniform', 6
exp_name = f'{setting}_{prior_type}_prior'

# four panels side by side, all handed to plot_gammadelta at once
fig, axs = init_figure(1, 4, 'ieee_quadruple')
plot_gammadelta(exp_name, axs, n_pools)

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

fig.savefig(f'imgs/{exp_name}_cont.pdf', bbox_inches='tight')

In [None]:
# contour (estimated): gamma/delta plots of the emojis run with the estimated prior
setting, prior_type, n_pools = 'emojis', 'estimated', 6
exp_name = f'{setting}_{prior_type}_prior'

# four panels side by side, all handed to plot_gammadelta at once
fig, axs = init_figure(1, 4, 'ieee_quadruple')
plot_gammadelta(exp_name, axs, n_pools)

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

fig.savefig(f'imgs/{exp_name}_cont.pdf', bbox_inches='tight')

## Attack on News/Web Domains

In [None]:
# web domains parameters
setting = 'news'
U, m, k, eps = 2000, 1024, 65536, 8
pool_sizes = [14, 13, 13, 10, 10]
p_omega = f'{p_omegas_folder}/{setting}_p_omega.pickle'
prior = f'{priors_folder}/{setting}_prior.pickle'
ns = [7, 30, 90, 180]

# user metadata: one random seed per user, reused by every run in this cell
user_seeds = np.random.randint(0, 2147483647, size=num_users)

logging.info(f'{setting} => starting')

# load cms into shared memory
shared_cms = load_cms(U, m, k, eps=eps, shared=True)
logging.info(f'{setting} => loaded shared cms')

try:
    # generate random p_\Omega only when no cached copy exists on disk
    if not os.path.isfile(p_omega):
        # one random block per pool plus one block for the rest of the
        # universe, renormalised so that the concatenation sums to 1
        p_omega_f = np.concatenate([random_pdf(p) for p in pool_sizes] +
                                   [random_pdf(U - sum(pool_sizes))])
        p_omega_f /= p_omega_f.sum()

        with open(p_omega, 'wb') as fh:
            pickle.dump(p_omega_f, fh)
    logging.info(f'{setting} => loaded p_omega')

    # generate/load prior
    load_prior(U, m, k, eps, pool_sizes, p_omega, prior)
    logging.info(f'{setting} => loaded prior')
    logging.info(f'{setting} => finished setting up')

    for prior_type in ['estimated', 'uniform']:
        # the 'uniform' run passes prior=None, the 'estimated' run the prior file
        curr_prior = None if prior_type == 'uniform' else prior

        # pack experiment parameters into object
        sub_folder = f'{setting}_{prior_type}_prior'
        exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
        exp_params = ExpParams(shared_cms, pool_sizes,
            exp_db_filename=exp_db_filename, reps=num_users,
            p_omega=p_omega, prior=curr_prior)

        logging.info(f'{setting}, {prior_type} => starting')

        for n in ns:
            exp_params.n = n
            logging.info(f'{setting}, {prior_type}, {n} => starting')
            pbar = tqdm(total=exp_params.reps, desc=f'n = {n}: ')
            single_exp(exp_params, user_seeds=user_seeds, pbar=pbar)
            logging.info(f'{setting}, {prior_type}, {n} => finished')

            # persist the result database after every n so that partial
            # progress survives an interrupted run
            with open(exp_db_filename, 'wb') as fh:
                pickle.dump(exp_params.EXP_DB, fh)

        logging.info(f'{setting}, {prior_type} => finished')
finally:
    # always release the shared cms, even when an experiment fails
    unload_cms(shared_cms)

## run non private
# a plain tuple takes the place of the shared cms for the non-private baseline
np_cms = (U, U, math.inf, 0, None, None)
sub_folder = f'{setting}_non_private'
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
exp_params = ExpParams(np_cms, pool_sizes,
    exp_db_filename=exp_db_filename, reps=num_users, p_omega=p_omega)
for n in ns:
    exp_params.n = n
    logging.info(f'{setting}, non-private, {n} => starting')
    pbar = tqdm(total=exp_params.reps, desc=f'n = {n}: ')
    single_exp(exp_params, user_seeds=user_seeds, pbar=pbar)
    logging.info(f'{setting}, non-private, {n} => finished')

    with open(exp_db_filename, 'wb') as fh:
        pickle.dump(exp_params.EXP_DB, fh)
logging.info(f'{setting} => finished')

In [None]:
# precision vs null rate
setting = 'news'
n_pools = 5
fig, axs = init_figure(1, 2, 'ieee_double')


def _reorder_legend(entries):
    """Return entries 1-4, then entry 0, then everything from entry 6 on."""
    return entries[1:5] + entries[:1] + entries[6:]


# left panel: uniform prior, then the non-private run on the same axes;
# AUC_DB is threaded through the calls so that it accumulates their results
AUC_DB = plot_prec_nr(f'{setting}_uniform_prior', axs[0], n_pools=n_pools)
AUC_DB = plot_prec_nr(
    f'{setting}_non_private', axs[0], n_pools=n_pools,
    AUC_DB=AUC_DB, nonprivate=True)

# right panel: estimated prior
AUC_DB = plot_prec_nr(
    f'{setting}_estimated_prior', axs[1], n_pools=n_pools, AUC_DB=AUC_DB)
# the non-private run is drawn again here; its return value is discarded
plot_prec_nr(f'{setting}_non_private', axs[1], n_pools=n_pools, nonprivate=True)

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

# custom legend, built from the entries of the last panel
h, l = axs[-1].get_legend_handles_labels()
handles = _reorder_legend(h)
labels = _reorder_legend(l)
legend_title = ('\\hphantom{\\text{.}} \\hspace{0.2cm} '
                '\\textbf{CMS} \\hspace{0.8cm} \\textbf{Non-Private}')
axs[-1].legend(handles, labels, bbox_to_anchor=(1.1, 1.0375), ncol=2,
    columnspacing=0.5, title=legend_title)

fig.tight_layout()
AUC_DB.to_pickle(f'imgs/{setting}.pkl')
fig.savefig(f'imgs/{setting}_prec_mr.pdf', bbox_inches='tight')

In [None]:
# conf score
setting = 'news'
fig, axs = init_figure(1, 2, 'ieee')

# one panel per prior type: (prior type, panel title), in plotting order
conf_panels = [
    ('uniform', '$\\text{Adv}_\\textit{weak}$'),
    ('estimated', '$\\text{Adv}_\\textit{strong}$'),
]
for panel_idx, (panel_prior, panel_title) in enumerate(conf_panels):
    plot_conf(f'{setting}_{panel_prior}_prior', axs[panel_idx])
    axs[panel_idx].set_title(panel_title)

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

# legend (attached to the last panel)
axs[-1].legend(bbox_to_anchor=(1.1, 1.0375))

fig.savefig(f'imgs/{setting}_conf.pdf', bbox_inches='tight')

In [None]:
# contour (uniform): gamma/delta plots of the news run with a uniform prior
setting, prior_type, n_pools = 'news', 'uniform', 5
exp_name = f'{setting}_{prior_type}_prior'

# four panels side by side, all handed to plot_gammadelta at once
fig, axs = init_figure(1, 4, 'ieee_quadruple')
plot_gammadelta(exp_name, axs, n_pools)

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

fig.savefig(f'imgs/{exp_name}_cont.pdf', bbox_inches='tight')

In [None]:
# contour (estimated): gamma/delta plots of the news run with the estimated prior
setting, prior_type, n_pools = 'news', 'estimated', 5
exp_name = f'{setting}_{prior_type}_prior'

# four panels side by side, all handed to plot_gammadelta at once
fig, axs = init_figure(1, 4, 'ieee_quadruple')
plot_gammadelta(exp_name, axs, n_pools)

# remove unnecessary labels and ticks
for ax in axs.flat:
    ax.label_outer()

fig.savefig(f'imgs/{exp_name}_cont.pdf', bbox_inches='tight')

## All confidence score plots

In [None]:
# all four confidence-score plots (2 settings x 2 prior types) in one row
fig, axs = init_figure(1, 4, 'ieee_quadruple')
adversaries = [
    ('uniform', '$\\text{Adv}_\\textit{weak}$'),
    ('estimated', '$\\text{Adv}_\\textit{strong}$'),
]
ax_id = 0
for setting in ['emojis', 'news']:
    setting_str = 'Emojis' if setting == 'emojis' else 'Web domains'
    for prior_type, prior_name in adversaries:
        ax = axs[ax_id]
        plot_conf(f'{setting}_{prior_type}_prior', ax=ax)

        # only the leftmost panel keeps its y-axis label and tick labels
        if ax_id > 0:
            ax.set_ylabel('')
            ax.set_yticklabels([])

        ax.set_title(f'{setting_str}, {prior_name}')
        ax_id += 1

# a single legend, attached to the last panel
axs[-1].legend(bbox_to_anchor=(1.1, 1.0375))
fig.tight_layout()
fig.savefig('imgs/all_conf.pdf', bbox_inches='tight')

## Attack on Twitter data

In [None]:
# twitter data parameters
setting = 'twitter'
U, m, k, eps = 3239, 1024, 65536, 4
pool_sizes = [273, 222, 263, 267, 184, 395]
prior = f'{priors_folder}/{setting}_prior.pickle'
ns = [7, 30, 90, 180]
# place pre-processed attack dataset and external dataset into this location
users_file = f'pickles/{setting}_att.json'
ref_file = f'pickles/{setting}_ext.pickle'

# fail early, with a clear message, when either input file is missing
# (FileNotFoundError is still an Exception, so existing handlers keep working)
if not os.path.isfile(users_file):
    raise FileNotFoundError(f'Dataset not found at {users_file}')
if not os.path.isfile(ref_file):
    raise FileNotFoundError(f'Auxiliary dataset not found at {ref_file}')

# load user ids (the keys of the JSON object stored in users_file)
with open(users_file, 'r') as f:
    users_data = json.load(f)
user_ids = users_data.keys()

# user metadata: one random seed per user
user_seeds = np.random.randint(0, 2147483647, size=len(user_ids))

# columns of the result database written by both runs below
exp_columns = ['unique_id', 'n', 'k', 'acc']

logging.info(f'{setting} => starting')

# load cms into shared memory
shared_cms = load_cms(U, m, k, eps=eps, shared=True)
logging.info(f'{setting} => loaded shared cms')

try:
    # generate/load prior
    # NOTE: unpickling runs arbitrary code; only place trusted files at ref_file
    with open(ref_file, 'rb') as f:
        ref_emojis = np.array(pickle.load(f))
    load_prior(U, m, k, eps, pool_sizes, prior_filename=prior, objects=ref_emojis)
    logging.info(f'{setting} => loaded prior')
    logging.info(f'{setting} => finished setting up')

    # only run experiment on estimated prior
    prior_type = 'estimated'
    curr_prior = None if prior_type == 'uniform' else prior

    # initialize DB to write result to
    sub_folder = f'{setting}_{prior_type}_prior'
    exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
    os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles', exist_ok=True)
    # rows are collected in a list and the frame is rebuilt from it, because
    # DataFrame.append no longer exists in pandas >= 2.0
    exp_rows = []
    EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)

    logging.info(f'{setting} => starting')

    for n in ns:
        # short random id that names the per-run result pickle
        unique_id = uuid.uuid4().hex[:5]
        logging.info(f'{setting}, {n} => starting')
        pbar = tqdm(total=len(user_ids), desc=f'n = {n}: ')
        acc, results = attack_real_users(n, user_ids, users_file,
            user_seeds, shared_cms, pool_sizes, prior, pbar=pbar)
        logging.info(f'{setting}, {n} => finished')

        exp_rows.append({'unique_id': unique_id, 'n': n, 'k': k, 'acc': acc})
        EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)
        with open(exp_db_filename, 'wb') as f:
            pickle.dump(EXP_DB, f)
        with open(f'{exps_folder}/{sub_folder}/exp_pickles/{unique_id}.pickle', 'wb') as f:
            pickle.dump(results, f)

    logging.info(f'{setting} => finished')
finally:
    # always release the shared cms, even when an experiment fails
    unload_cms(shared_cms)

## run non private
# a plain tuple takes the place of the shared cms for the non-private baseline
np_cms = (U, U, math.inf, 0, None, None)
sub_folder = f'{setting}_non_private'
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles', exist_ok=True)
exp_rows = []
EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)
for n in ns:
    unique_id = uuid.uuid4().hex[:5]
    logging.info(f'{setting}, non-private, {n} => starting')
    pbar = tqdm(total=len(user_ids), desc=f'n = {n}: ')
    acc, results = attack_real_users(n, user_ids, users_file,
        user_seeds, np_cms, pool_sizes, prior, pbar=pbar)
    logging.info(f'{setting}, non-private, {n} => finished')

    # non-private rows are stored with k = 0
    exp_rows.append({'unique_id': unique_id, 'n': n, 'k': 0, 'acc': acc})
    EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)
    with open(exp_db_filename, 'wb') as f:
        pickle.dump(EXP_DB, f)
    with open(f'{exps_folder}/{sub_folder}/exp_pickles/{unique_id}.pickle', 'wb') as f:
        pickle.dump(results, f)
logging.info(f'{setting} => finished')

In [None]:
## plot prec-nr and conf score on same plot
fig, axs = init_figure(1, 2, 'ieee_double')
ns = [7, 30, 90, 180]
n_pools = 6
setting = 'twitter'

## prec-nr
ax = axs[0]
# plot estimated prior
AUC_DB = plot_prec_nr(f'{setting}_estimated_prior', ax, n_pools=n_pools)
# plot non-private
AUC_DB = plot_prec_nr(f'{setting}_non_private', ax, n_pools=n_pools,
    AUC_DB=AUC_DB, nonprivate=True)
AUC_DB.to_pickle(f'imgs/{setting}.pkl')

# custom legend: entries 1-4 first, then entry 0, then the remaining entries
h, l = ax.get_legend_handles_labels()
handles = h[1:5] + [h[0]] + h[5:]
labels = l[1:5] + [l[0]] + l[5:]
ax.legend(handles, labels, bbox_to_anchor=(1, 1.0375), ncol=2, columnspacing=0.5,
          title='\\hphantom{\\text{.}} \\hspace{0.2cm} \\textbf{Private} \\hspace{0.8cm} \\textbf{Non-Private}')

# conf score
ax = axs[1]
# read the result database through a context manager so the file is closed
with open(f'{exps_folder}/{setting}_estimated_prior/EXP_DB.pickle', 'rb') as fh:
    EXP_DB = pickle.load(fh)
# keep only rows with k != 0
EXP_DB = EXP_DB[EXP_DB['k'] != 0]
for (i, n) in enumerate(ns):
    # first recorded run for this n identifies the result pickle to plot
    unique_id = EXP_DB[EXP_DB['n'] == n]['unique_id'].iloc[0]
    plot_spl_conf(f'{exps_folder}/{setting}_estimated_prior/exp_pickles/{unique_id}.pickle',
                  ax, colors_to_use[len(ns)][i], label=f'$n = {n}$')
ax.legend(bbox_to_anchor=(1, 1.025))
# make the drawn axes box square
ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')

fig.tight_layout()
fig.subplots_adjust(wspace=1.8)
fig.savefig(f'imgs/{setting}_prec_nr_conf.pdf', bbox_inches='tight')

In [None]:
## heatmap of precision
setting = 'twitter'
prior_type = 'estimated'

fig, axs = init_figure(1, 4, 'ieee_quadruple')
folder = f'{exps_folder}/{setting}_{prior_type}_prior'
ns = [7, 30, 90, 180]
# read the result database through a context manager so the file is closed
with open(f'{folder}/EXP_DB.pickle', 'rb') as fh:
    EXP_DB = pickle.load(fh)
# keep only rows with k != 0
EXP_DB = EXP_DB[EXP_DB['k'] != 0]
for (i, n) in enumerate(ns):
    # first recorded run for this n identifies the result pickle to plot
    unique_id = EXP_DB[EXP_DB['n'] == n]['unique_id'].iloc[0]
    plot_gammadelta_heatmap(f'{folder}/exp_pickles/{unique_id}.pickle', axs[i])
    axs[i].set_title(f'$n = {n}$')

# remove unnecessary labels and ticks
for ax in axs:
    ax.label_outer()
fig.tight_layout(w_pad=3)
fig.savefig(f'imgs/{setting}_{prior_type}_prior_cont.pdf', bbox_inches='tight')

In [None]:
## heatmap of distribution
setting = 'twitter'
prior_type = 'estimated'

fig, ax = init_figure(1, 1, 'ieee')
folder = f'{exps_folder}/{setting}_{prior_type}_prior/'
# only the n = 30 run is plotted here
n = 30
# read the result database through a context manager so the file is closed
with open(f'{folder}/EXP_DB.pickle', 'rb') as fh:
    EXP_DB = pickle.load(fh)
# keep only rows with k != 0
EXP_DB = EXP_DB[EXP_DB['k'] != 0]
# first recorded run for this n identifies the result pickle to plot
unique_id = EXP_DB[EXP_DB['n'] == n]['unique_id'].iloc[0]
plot_gammadelta_heatmap(f'{folder}/exp_pickles/{unique_id}.pickle', ax, plot='dist')
fig.tight_layout()
fig.savefig(f'imgs/{setting}_{prior_type}_prior_dist.pdf', bbox_inches='tight')

# Appendices

## Effect of entropy

In [None]:
# emojis parameters
setting = 'emojis'
U, m, k, eps = 2600, 1024, 65536, 4
n = 180
Ps = [10, 50, 200, 400]
kappas = [0, 0.5, 1, 2, 4]
# rows are collected in a list and the frame is rebuilt from it, because
# DataFrame.append no longer exists in pandas >= 2.0
exp_columns = ['unique_id', 'P', 's', 'entropy', 'auc']
exp_rows = []
EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)
sub_folder = f'{setting}_entropy'
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
# this cell writes its per-run results below exp_pickles itself, so make sure
# the folder exists (no-op when something else already created it)
os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles', exist_ok=True)

# number of users can be smaller because we don't plot gamma, delta
num_users = 5000

# user metadata: one random seed per user
user_seeds = np.random.randint(0, 2147483647, size=num_users)

logging.info(f'{sub_folder} => starting')

# load cms into shared memory
shared_cms = load_cms(U, m, k, eps=eps, shared=True)
logging.info(f'{sub_folder} => loaded shared cms')

try:
    for P in Ps:
        logging.info(f'{sub_folder}, {P} => starting')
        # six pools of size P each
        pool_sizes = [P] * 6
        for kappa in kappas:
            # short random id that names the per-run result pickle
            unique_id = uuid.uuid4().hex[:5]
            p_omega = f'{p_omegas_folder}/{setting}_p_omega_{P}_{kappa}.pickle'

            # generate zipf p_\Omega only when no cached copy exists on disk
            if not os.path.isfile(p_omega):
                p_omega_f = np.concatenate([zip_pdf(kappa, p) for p in pool_sizes] +
                                           [zip_pdf(kappa, U - sum(pool_sizes))])
                p_omega_f /= p_omega_f.sum()
                with open(p_omega, 'wb') as fh:
                    pickle.dump(p_omega_f, fh)
            logging.info(f'{sub_folder}, {P}, {kappa} => loaded p_omega')

            # prior is same as p_\Omega in this setting
            prior = p_omega
            logging.info(f'{sub_folder}, {P}, {kappa} => finished setting up')

            # only run experiment with estimated prior
            prior_type = 'estimated'
            curr_prior = None if prior_type == 'uniform' else prior

            # pack experiment parameters into object
            exp_params = ExpParams(shared_cms, pool_sizes,
                exp_db_filename=exp_db_filename, reps=num_users,
                p_omega=p_omega, prior=curr_prior, n=n, EXP_DB=EXP_DB)

            logging.info(f'{sub_folder}, {P}, {kappa} => starting')

            pbar = tqdm(total=exp_params.reps, desc=f'P = {P}, kappa = {kappa}: ')
            acc, results = attack(exp_params, user_seeds=user_seeds, pbar=pbar)
            logging.info(f'{sub_folder}, {P}, {kappa} => attack finished')

            # get auc
            _, _, curr_auc = parse_pickle_prec_nr(results=results)
            # calculate entropy (natural log) of the zipf distribution of one pool
            pool_probs = zip_pdf(kappa, P)
            curr_entropy = -np.sum(pool_probs * np.log(pool_probs))
            exp_rows.append({
                'unique_id': unique_id, 'P': P, 's': kappa, 'entropy': curr_entropy, 'auc': curr_auc
            })
            EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)

            # persist after every run so that partial progress survives a crash
            with open(exp_db_filename, 'wb') as fh:
                pickle.dump(EXP_DB, fh)
            with open(f'{exps_folder}/{sub_folder}/exp_pickles/{unique_id}.pickle', 'wb') as fh:
                pickle.dump(results, fh)
            logging.info(f'{sub_folder}, {P}, {kappa} => finished')
        logging.info(f'{sub_folder}, {P} => finished')
finally:
    # always release the shared cms, even when an experiment fails
    unload_cms(shared_cms)

# write a copy of result database to images folder to commit it to git
with open(f'imgs/{sub_folder}.pkl', 'wb') as fh:
    pickle.dump(EXP_DB, fh)
logging.info(f'{sub_folder} => finished')

## Robustness of attack

In [None]:
## pre-processing before attack
# emojis parameters
setting = 'emojis'
U, m, k, eps = 2600, 1024, 65536, 4
pool_sizes = [228, 228, 228, 228, 228, 228]
p_omega = f'{p_omegas_folder}/{setting}_p_omega.pickle'
ns = [7, 30, 90, 180]
# noise levels, with matching display strings in the same order
sigmas = [0, 1e-5, 1e-4, 1e-3, 1e-2]
sigma_strs = ['0', '1e-5', '1e-4', '1e-3', '1e-2']

# load p_\Omega; the file is not generated here, so it must already exist
# (the emojis attack cell writes it to this same path)
with open(p_omega, 'rb') as fh:
    p_omega_f = pickle.load(fh)

# calculate average JSD for each sigma
def kld(p, q):
    """Return the Kullback-Leibler divergence D(p || q) in bits.

    Entries where ``p`` is zero contribute nothing to the sum. The sum is
    computed with one vectorised NumPy expression instead of a Python loop,
    because this function is called many times on whole distributions.

    Args:
        p: 1-D sequence or array of probabilities.
        q: 1-D sequence or array of probabilities, same length as ``p``.

    Returns:
        The sum of ``p_x * log2(p_x / q_x)`` over all ``x`` with ``p_x != 0``;
        ``0.0`` when ``p`` has no non-zero entry.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # restrict both vectors to the support of p before taking the logarithm
    support = p != 0
    p_s, q_s = p[support], q[support]
    return np.sum(p_s * np.log2(p_s / q_s))

def jsd(p, q):
    """Return the Jensen-Shannon divergence between ``p`` and ``q``.

    This is the average of the two KL divergences (see ``kld``) from ``p``
    and from ``q`` to their element-wise midpoint.
    """
    midpoint = (p + q) / 2
    divergence_p = kld(p, midpoint)
    divergence_q = kld(q, midpoint)
    return (divergence_p + divergence_q) / 2

# number of users can be smaller because we don't plot gamma, delta
reps = 5000

# generate/load prior
prior = f'{priors_folder}/{setting}_prior.pickle'
load_prior(U, m, k, eps, pool_sizes, p_omega, prior)
with open(prior, 'rb') as fh:
    prior_f = pickle.load(fh)

# average JSD between the prior and `reps` noisy user distributions, per sigma
sigma_jsd = dict()
for (sigma, sigma_str) in tqdm(zip(sigmas, sigma_strs), total=len(sigmas)):
    jsds = np.zeros(reps)
    for rep in range(reps):
        # generate a valid p_usr: add gaussian noise to p_\Omega, shift by the
        # absolute value of the minimum and renormalise to sum 1
        # NOTE(review): the shift is applied even when the minimum is already
        # positive -- confirm this matches how the attack perturbs p_usr
        p_usr = p_omega_f + np.random.normal(scale=sigma, size=p_omega_f.shape)
        p_usr += np.abs(p_usr.min())
        p_usr /= p_usr.sum()

        jsds[rep] = jsd(prior_f, p_usr)
    sigma_jsd[sigma] = jsds.mean()

In [None]:
## actual attack
# emojis parameters
setting = 'emojis'
U, m, k, eps = 2600, 1024, 65536, 4
pool_sizes = [228, 228, 228, 228, 228, 228]
p_omega = f'{p_omegas_folder}/{setting}_p_omega.pickle'
prior = f'{priors_folder}/{setting}_prior.pickle'
# zipf exponent of the emojis p_\Omega; set explicitly so that regenerating the
# file never picks up a stale `kappa` left behind by another cell
kappa = 1.2
ns = [7, 30, 90, 180]
sigmas = [0, 1e-5, 1e-4, 1e-3, 1e-2]
sigma_strs = ['0', '1e-5', '1e-4', '1e-3', '1e-2']

# the average JSD per sigma comes from the pre-processing cell; check it now
# instead of failing only after the first (long) attack has finished
try:
    missing_sigmas = [s for s in sigmas if s not in sigma_jsd]
except NameError:
    raise RuntimeError('sigma_jsd is not defined; run the pre-processing cell first') from None
if missing_sigmas:
    raise RuntimeError(f'sigma_jsd has no entry for sigmas {missing_sigmas}')

sub_folder = f'{setting}_robustness'
# rows are collected in a list and the frame is rebuilt from it, because
# DataFrame.append no longer exists in pandas >= 2.0
exp_columns = ['unique_id', 'sigma', 'n', 'avgD', 'auc']
exp_rows = []
EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
# this cell writes its per-run results below exp_pickles itself, so make sure
# the folder exists (no-op when something else already created it)
os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles', exist_ok=True)

# number of users can be smaller because we don't plot gamma, delta
num_users = 5000

# user metadata: one random seed per user
user_seeds = np.random.randint(0, 2147483647, size=num_users)

logging.info(f'{sub_folder} => starting')

# load cms into shared memory
shared_cms = load_cms(U, m, k, eps=eps, shared=True)
logging.info(f'{sub_folder} => loaded shared cms')

try:
    # generate zipf p_\Omega only when no cached copy exists on disk
    if not os.path.isfile(p_omega):
        p_omega_f = np.concatenate([zip_pdf(kappa, p) for p in pool_sizes] +
                                   [zip_pdf(kappa, U - sum(pool_sizes))])
        p_omega_f /= p_omega_f.sum()
        with open(p_omega, 'wb') as fh:
            pickle.dump(p_omega_f, fh)

    # generate/load prior
    load_prior(U, m, k, eps, pool_sizes, p_omega, prior)
    logging.info(f'{sub_folder} => loaded prior')
    logging.info(f'{sub_folder} => finished setting up')

    for (sigma, sigma_str) in zip(sigmas, sigma_strs):
        logging.info(f'{sub_folder}, {sigma_str} => starting')

        # only run experiment with estimated prior
        prior_type = 'estimated'
        curr_prior = None if prior_type == 'uniform' else prior

        # pack experiment parameters into object
        exp_params = ExpParams(shared_cms, pool_sizes,
            exp_db_filename=exp_db_filename, reps=num_users,
            p_omega=p_omega, prior=curr_prior, sigma=sigma, EXP_DB=EXP_DB)

        for n in ns:
            # short random id that names the per-run result pickle
            unique_id = uuid.uuid4().hex[:5]
            exp_params.n = n
            logging.info(f'{sub_folder}, {sigma_str}, {n} => starting')
            pbar = tqdm(total=exp_params.reps, desc=f'sigma = {sigma_str}, n = {n}: ')
            acc, results = attack(exp_params, user_seeds=user_seeds, pbar=pbar)
            logging.info(f'{sub_folder}, {sigma_str}, {n} => finished')

            # get auc
            _, _, curr_auc = parse_pickle_prec_nr(results=results)
            # get avgD
            curr_avgD = sigma_jsd[sigma]
            exp_rows.append({
                'unique_id': unique_id, 'sigma': sigma, 'n': n, 'avgD': curr_avgD, 'auc': curr_auc
            })
            EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)

            # persist after every run so that partial progress survives a crash
            with open(exp_db_filename, 'wb') as fh:
                pickle.dump(EXP_DB, fh)
            with open(f'{exps_folder}/{sub_folder}/exp_pickles/{unique_id}.pickle', 'wb') as fh:
                pickle.dump(results, fh)

            logging.info(f'{sub_folder}, {sigma_str}, {n} => finished')
        logging.info(f'{sub_folder}, {sigma_str} => finished')
finally:
    # always release the shared cms, even when an experiment fails
    unload_cms(shared_cms)

# write a copy of result database to images folder to commit it to git
with open(f'imgs/{sub_folder}.pkl', 'wb') as fh:
    pickle.dump(EXP_DB, fh)
logging.info(f'{sub_folder} => finished')

## Size of the universe

In [None]:
# web domains parameters
setting = 'news'
m, k, eps = 1024, 65536, 8
ns = [7, 30, 90, 180]
Us = [1000, 10000, 100000, 250000]
pool_sizes = [14, 13, 13, 10, 10]
# rows are collected in a list and the frame is rebuilt from it, because
# DataFrame.append no longer exists in pandas >= 2.0
exp_columns = ['unique_id', 'U', 'n', 'auc']
exp_rows = []
EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)
sub_folder = f'{setting}_size_of_universe'
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
# this cell writes its per-run results below exp_pickles itself, so make sure
# the folder exists (no-op when something else already created it)
os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles', exist_ok=True)

# number of users can be smaller because we don't plot gamma, delta
num_users = 5000

# user metadata: one random seed per user
user_seeds = np.random.randint(0, 2147483647, size=num_users)

logging.info(f'{sub_folder} => starting')

for U in Us:
    logging.info(f'{sub_folder}, {U} => starting')
    # load cms into shared memory (don't save to file); this happens before the
    # try block so that a failed load never reaches unload_cms with a handle
    # that is undefined or was already released in the previous iteration
    shared_cms = load_cms(U, m, k, eps=eps, shared=True, save_to_file=False)
    try:
        logging.info(f'{sub_folder}, {U} => loaded shared cms')

        # p_\Omega depends on U only, so it is prepared once per universe size
        p_omega = f'{p_omegas_folder}/{setting}_p_omega_{U}.pickle'
        if not os.path.isfile(p_omega):
            p_omega_f = np.concatenate([random_pdf(p) for p in pool_sizes] +
                                       [random_pdf(U - sum(pool_sizes))])
            p_omega_f /= p_omega_f.sum()

            with open(p_omega, 'wb') as fh:
                pickle.dump(p_omega_f, fh)
        logging.info(f'{sub_folder}, {U} => loaded p_omega')

        # only run experiment with uniform prior, i.e. no prior file is passed
        prior_type = 'uniform'
        curr_prior = None

        for n in ns:
            # short random id that names the per-run result pickle
            unique_id = uuid.uuid4().hex[:5]
            logging.info(f'{sub_folder}, {U}, {n} => finished setting up')

            # pack experiment parameters into object
            exp_params = ExpParams(shared_cms, pool_sizes,
                exp_db_filename=exp_db_filename, reps=num_users,
                p_omega=p_omega, prior=curr_prior, n=n, EXP_DB=EXP_DB)

            logging.info(f'{sub_folder}, {U}, {n} => starting')

            pbar = tqdm(total=exp_params.reps, desc=f'U = {U}, n = {n}: ')
            acc, results = attack(exp_params, user_seeds=user_seeds, pbar=pbar)
            logging.info(f'{sub_folder}, {U}, {n} => finished')

            # get auc
            _, _, curr_auc = parse_pickle_prec_nr(results=results)
            exp_rows.append({
                'unique_id': unique_id, 'U': U, 'n': n, 'auc': curr_auc
            })
            EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)

            # persist after every run so that partial progress survives a crash
            with open(exp_db_filename, 'wb') as fh:
                pickle.dump(EXP_DB, fh)
            with open(f'{exps_folder}/{sub_folder}/exp_pickles/{unique_id}.pickle', 'wb') as fh:
                pickle.dump(results, fh)
            logging.info(f'{sub_folder}, {U}, {n} => finished')
        logging.info(f'{sub_folder}, {U} => finished')
    finally:
        # release this universe's cms before the next one is loaded
        unload_cms(shared_cms)

# write a copy of result database to images folder to commit it to git
with open(f'imgs/{sub_folder}.pkl', 'wb') as fh:
    pickle.dump(EXP_DB, fh)
logging.info(f'{sub_folder} => finished')

## Effect of $\varepsilon$ on AUC

In [None]:
# news parameters
setting = 'news'
U, m, k = 2000, 1024, 65536
pool_sizes = [14, 13, 13, 10, 10]
p_omega = f'{p_omegas_folder}/{setting}_p_omega.pickle'
prior = f'{priors_folder}/{setting}_prior.pickle'
ns = [7, 30, 90, 180]
epses = [0.01, 0.1, 1, 4, 8]
# privacy budget used only to load the cms; set_cms_eps replaces it before
# every run. 8 is the value the main news experiment uses.
initial_eps = 8

sub_folder = f'{setting}_eps'
# rows are collected in a list and the frame is rebuilt from it, because
# DataFrame.append no longer exists in pandas >= 2.0
exp_columns = ['unique_id', 'eps', 'n', 'auc']
exp_rows = []
EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
# this cell writes its per-run results below exp_pickles itself, so make sure
# the folder exists (no-op when something else already created it)
os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles', exist_ok=True)

# number of users can be smaller because we don't plot gamma, delta
num_users = 5000

# user metadata: one random seed per user
user_seeds = np.random.randint(0, 2147483647, size=num_users)

logging.info(f'{sub_folder} => starting')

# load cms into shared memory
shared_cms = load_cms(U, m, k, eps=initial_eps, shared=True)
logging.info(f'{sub_folder} => loaded shared cms')

try:
    # generate random p_\Omega only when no cached copy exists on disk, and
    # write it out: ExpParams below is given the file path, not the array
    if not os.path.isfile(p_omega):
        p_omega_f = np.concatenate([random_pdf(p) for p in pool_sizes] +
                                   [random_pdf(U - sum(pool_sizes))])
        p_omega_f /= p_omega_f.sum()

        with open(p_omega, 'wb') as fh:
            pickle.dump(p_omega_f, fh)
    logging.info(f'{sub_folder} => loaded p_omega')

    logging.info(f'{sub_folder} => finished setting up')

    for eps in epses:
        logging.info(f'{sub_folder}, {eps} => starting')

        # only run experiment with uniform prior
        prior_type = 'uniform'
        curr_prior = None if prior_type == 'uniform' else prior

        # pack experiment parameters into object; no sigma is passed, this
        # experiment varies eps only
        shared_cms = set_cms_eps(shared_cms, eps)
        exp_params = ExpParams(shared_cms, pool_sizes,
            exp_db_filename=exp_db_filename, reps=num_users,
            p_omega=p_omega, prior=curr_prior, EXP_DB=EXP_DB)

        for n in ns:
            # short random id that names the per-run result pickle
            unique_id = uuid.uuid4().hex[:5]
            exp_params.n = n
            logging.info(f'{sub_folder}, {eps}, {n} => starting')
            pbar = tqdm(total=exp_params.reps, desc=f'eps = {eps}, n = {n}: ')
            acc, results = attack(exp_params, user_seeds=user_seeds, pbar=pbar)
            logging.info(f'{sub_folder}, {eps}, {n} => finished')

            # get auc
            _, _, curr_auc = parse_pickle_prec_nr(results=results)
            exp_rows.append({
                'unique_id': unique_id, 'eps': eps, 'n': n, 'auc': curr_auc
            })
            EXP_DB = pd.DataFrame(exp_rows, columns=exp_columns)

            # persist after every run so that partial progress survives a crash
            with open(exp_db_filename, 'wb') as fh:
                pickle.dump(EXP_DB, fh)
            with open(f'{exps_folder}/{sub_folder}/exp_pickles/{unique_id}.pickle', 'wb') as fh:
                pickle.dump(results, fh)

            logging.info(f'{sub_folder}, {eps}, {n} => finished')
        logging.info(f'{sub_folder}, {eps} => finished')
finally:
    # always release the shared cms, even when an experiment fails
    unload_cms(shared_cms)

# write a copy of result database to images folder to commit it to git
with open(f'imgs/{sub_folder}.pkl', 'wb') as fh:
    pickle.dump(EXP_DB, fh)
logging.info(f'{sub_folder} => finished')

## Effect of $\varepsilon$ on Utility

In [None]:
# Utility experiment: for every (n, eps) pair, estimate object frequencies
# with run_utility_vs_eps and compare them against the true p_\Omega.
setting = 'news'
U, m, k = 2000, 1024, 65536
epses = [0.01, 0.1, 1, 4, 8]
ns = [10**6, 10**7, 10**8, 10**9]
max_workers = 38

sub_folder = f'{setting}_utility_vs_eps'
EXP_DB = pd.DataFrame(columns=['eps', 'n', 'mae', 'mape_80'])
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles/', exist_ok=True)

logging.info(f'{sub_folder} => starting')

# load p_\Omega
# Done before any shared memory is allocated so that a missing or unreadable
# file fails fast without leaving shared-memory segments behind.
p_omega_f = f'{p_omegas_folder}/{setting}_p_omega.pickle'
with open(p_omega_f, 'rb') as p_omega_file:
    p_omega = pickle.load(p_omega_file)

# choose the top X fraction of objects, ranked by descending probability
sort_inds = np.flip(np.argsort(p_omega))
X = 0.8
x = int(X * len(p_omega))
top_x_p_omega = p_omega[sort_inds[:x]]

# Share sketch matrix on shared_memory to conserve space
# (one k-by-m buffer per worker, each initialised from the zero matrix M)
M = np.zeros((k, m))
shm_Ms, shared_Ms = [], []
try:
    for _ in tqdm(range(max_workers)):
        shm_M = SharedMemory(create=True, size=M.nbytes)
        # register the segment right away so the finally-clause releases it
        # even if initialising it (or allocating a later one) fails
        shm_Ms.append(shm_M)

        shared_M = np.ndarray(shape=M.shape, dtype=M.dtype, buffer=shm_M.buf)
        shared_M[:] = M[:]
        shared_Ms.append(shared_M)
    logging.info(f'{sub_folder} => allocated memory for sketch matrices in shared memory')
    logging.info(f'{sub_folder} => finished setting up')

    for n in ns:
        logging.info(f'{sub_folder}, {n} => starting')
        for eps in epses:
            logging.info(f'{sub_folder}, {n}, {eps} => starting')
            freqs = run_utility_vs_eps(n, U, m, k, eps, p_omega_f, shm_Ms, shared_Ms, max_workers)
            # divide by n so the result is comparable with p_omega
            freqs = np.array(freqs) / n

            # mean absolute error over all objects
            curr_mae = np.mean(np.abs(freqs - p_omega))
            # mean absolute percentage error over the top-x objects only
            top_x_freqs = freqs[sort_inds[:x]]
            curr_mape_80 = np.mean(np.abs(top_x_freqs - top_x_p_omega) / top_x_p_omega) * 100

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported way to add a row and works on older versions as well.
            new_row = pd.DataFrame([{
                'eps': eps, 'n': n, 'mae': curr_mae, 'mape_80': curr_mape_80
            }])
            EXP_DB = pd.concat([EXP_DB, new_row], ignore_index=True)

            # checkpoint the result database and the raw estimates per run
            with open(exp_db_filename, 'wb') as exp_db_file:
                pickle.dump(EXP_DB, exp_db_file)
            with open(f'{exps_folder}/{sub_folder}/exp_pickles/{n}_{eps}.npy', 'wb') as result_f:
                np.save(result_f, freqs)

            logging.info(f'{sub_folder}, {n}, {eps} => finished')
        logging.info(f'{sub_folder}, {n} => finished')
finally:
    # clear shared memory
    for shm_M in shm_Ms:
        shm_M.close()
        shm_M.unlink()

## Effect of hashing

In [None]:
# web domains parameters
# (m is set equal to U here, unlike the m = 1024 used with U = 2000 in the
# utility experiment)
setting = 'news'
U, m, k, eps = 2000, 2000, 65536, 8
pool_sizes = [14, 13, 13, 10, 10]
p_omega = f'{p_omegas_folder}/{setting}_p_omega.pickle'
prior = f'{priors_folder}/{setting}_prior_{m}.pickle'
ns = [7, 30, 90, 180]

sub_folder = f'{setting}_hashing'
EXP_DB = pd.DataFrame(columns=['unique_id', 'prior_type', 'n', 'auc'])
exp_db_filename = f'{exps_folder}/{sub_folder}/EXP_DB.pickle'
# make sure the output folders exist before any result is written to them
os.makedirs(f'{exps_folder}/{sub_folder}/exp_pickles/', exist_ok=True)

# number of users can be smaller because we don't plot gamma, delta
num_users = 5000

# user metadata
# NOTE(review): user_seeds is generated here but never passed to attack()
# below -- confirm whether attack(..., user_seeds=user_seeds) was intended.
user_seeds = np.random.randint(0, 2147483647, size=num_users)

logging.info(f'{setting} => starting')

# load cms into shared memory
shared_cms = load_cms(U, m, k, eps=eps, shared=True)
logging.info(f'{setting} => loaded shared cms')

try:
    # generate/load random p_\Omega
    if not os.path.isfile(p_omega):
        universe = np.arange(U)
        p_omega_f = np.concatenate([random_pdf(p) for p in pool_sizes] +
                                   [random_pdf(U - sum(pool_sizes))])
        # renormalise so the concatenated pools form a single distribution
        p_omega_f /= p_omega_f.sum()

        with open(p_omega, 'wb') as p_omega_file:
            pickle.dump(p_omega_f, p_omega_file)
    logging.info(f'{setting} => loaded p_omega')

    # generate/load prior
    load_prior(U, m, k, eps, pool_sizes, p_omega, prior)
    logging.info(f'{setting} => loaded prior')
    logging.info(f'{setting} => finished setting up')

    for prior_type in ['estimated', 'uniform']:
        # a uniform prior is requested by passing prior=None
        curr_prior = None if prior_type == 'uniform' else prior

        # pack experiment parameters into object
        exp_params = ExpParams(shared_cms, pool_sizes,
            exp_db_filename=exp_db_filename, reps=num_users,
            p_omega=p_omega, prior=curr_prior, EXP_DB=EXP_DB)

        logging.info(f'{setting}, {prior_type} => starting')

        for n in ns:
            # short random id used as the file name of this run's result pickle
            unique_id = uuid.uuid4().hex[:5]
            exp_params.n = n
            logging.info(f'{setting}, {prior_type}, {n} => starting')
            pbar = tqdm(total=num_users, desc=f'n = {n}: ')
            acc, results = attack(exp_params, pbar=pbar)
            logging.info(f'{setting}, {prior_type}, {n} => finished')

            # get auc
            _, _, curr_auc = parse_pickle_prec_nr(results=results)

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported way to add a row and works on older versions as well.
            new_row = pd.DataFrame([{
                'unique_id': unique_id, 'prior_type': prior_type, 'n': n, 'auc': curr_auc
            }])
            EXP_DB = pd.concat([EXP_DB, new_row], ignore_index=True)

            # checkpoint the result database and the raw results per run
            with open(exp_db_filename, 'wb') as exp_db_file:
                pickle.dump(EXP_DB, exp_db_file)
            with open(f'{exps_folder}/{sub_folder}/exp_pickles/{unique_id}.pickle', 'wb') as results_file:
                pickle.dump(results, results_file)

        logging.info(f'{setting}, {prior_type} => finished')
finally:
    # always release the shared CMS, even if an experiment raised
    unload_cms(shared_cms)

logging.info(f'{setting} => finished')

## Emojis PDF

In [None]:
# Plot the emojis probability mass function and save it as a PDF figure.
fig, ax = init_figure(1, 1, 'ieee_triple')

# `p_omega` first holds the pickle path and is then rebound to the loaded
# distribution; the context manager closes the file handle after reading.
p_omega = f'{p_omegas_folder}/emojis_p_omega.pickle'
with open(p_omega, 'rb') as p_omega_file:
    p_omega = pickle.load(p_omega_file)

# one x position per object, in the order stored in the pickle
xs = np.arange(len(p_omega))
ax.plot(xs, p_omega, color=colors_to_use[1][0], linewidth=1)

ax.set_xlabel('Object')
ax.set_ylabel('Probability mass')
# axis limits and ticks are hard-coded for object indices 0..2599
ax.set_xlim(0, 2599)
ax.set_xticks([0, 500, 1000, 1500, 2000, 2500, 2599])
fig.savefig('imgs/zipf_mixture.pdf', bbox_inches='tight')