In [1]:
from saibot import *

In [2]:
# Use Saibot to privatize union
#
# One synthetic table is cut into a buyer half and a seller half. Each half
# perturbs its own aggregates with Gaussian noise, and the estimator built
# from the two noisy halves is compared with the one built from the full,
# noise-free table.

# --- DP constants ---
B = 5
eps = 1
delta = 1e-6

# --- data parameters ---
num_att = 3
num_rows = 1000

# --- data search parameters ---
num_users = 2
requests = 1
ridge = 0

# Record the per-party size before doubling; after doubling, each of the two
# halves below holds the original `num_rows` rows.
result = [int(num_rows/2)]
num_rows *= 2

# First column is the regression target, the remaining ones are features.
columns = [f"f{i}" for i in range(num_att)]
y, *X = columns

# --- create datasets ---
mean, cov = get_pos_def_mean_cov(num_att)
df = generate_dataframe(mean, cov, num_rows, columns, B)

sensitivity = union_sensitivity(B)

# --- non-privatized reference estimator (whole table, no noise) ---
non_privatized_data = agg_dataset()
non_privatized_data.load(df.copy(deep=True), columns, [])
non_privatized_data.compute_agg()

non_privatized_cov = non_privatized_data.agg/num_rows
non_privatized_parameter = ridge_linear_regression(non_privatized_data.agg, X, y, ridge)

noise = compute_std(eps, delta, sensitivity)

# --- fdp estimator: buyer gets the first half of the rows, seller the rest ---
half = int(num_rows/2)

buyeragg = agg_dataset()
buyeragg.load(df[:half].copy(deep=True), columns, [])
buyeragg.compute_agg()

selleragg = agg_dataset()
selleragg.load(df[half:].copy(deep=True), columns, [])
selleragg.compute_agg()

# impute noise: seller first, then buyer; every semi-ring column except
# "cov:c" (presumably the row count -- TODO confirm) gets one Gaussian draw.
for party in (selleragg, buyeragg):
    for col in party.semi_ring_columns():
        if col == "cov:c":
            continue
        party.agg[col] += np.random.normal(0, noise, 1)

# Combine the two noisy halves, each scaled by the total row count.
seller_share = selleragg.agg/num_rows
buyer_share = buyeragg.agg/num_rows
fdp_cov = seller_share + buyer_share
print("s error", get_l2_distance(fdp_cov, non_privatized_cov))

# Fit the regression only when the noisy aggregates pass the sanity check.
if sanity_check(fdp_cov, X, ridge):
    fdp_parameter = ridge_linear_regression(fdp_cov, X, y, ridge)
    print("beta error", get_l2_distance(fdp_parameter, non_privatized_parameter))
else:
    print("fail")

s error 0.3901811715524032
beta error 0.11898090084670562


In [3]:
# Use Saibot to privatize join
#
# The buyer and the seller each hold a vertical slice of one synthetic table
# and share a "join_key" column. Each side adds Gaussian noise to its own
# per-key aggregates, the noisy aggregates are joined, and the ridge
# regression fitted on that join is compared with the one fitted on the
# noise-free join.
# NOTE: no random seed is set in this cell, so unless one is set elsewhere the
# printed error changes from run to run.

# DP constants
B = 5
eps = 1
delta = 1e-6

# data parameters
buyer_att = 2
seller_att = 1
num_group = 100
num_rows_per_group = 100

# data search parameters
num_users = 2
requests = 1
ridge = 10

num_att = buyer_att + seller_att
num_rows = num_group * num_rows_per_group

# First column is the regression target; the buyer owns the first `buyer_att`
# columns and the seller owns the rest.
columns = [f"f{i}" for i in range(num_att)]
X = columns[1:]
y = columns[0]
buyer_columns = columns[:buyer_att]
seller_columns = columns[buyer_att:]


def _build_join_agg(frame, feature_columns):
    """Load `feature_columns` plus "join_key" from `frame` into a fresh agg_dataset.

    A deep copy of the column slice is loaded, so `frame` itself is not handed
    to the dataset. Returns the dataset after `compute_agg()` has been called.
    """
    dataset = agg_dataset()
    dataset.load(frame[feature_columns + ["join_key"]].copy(deep=True), feature_columns, ["join_key"])
    dataset.compute_agg()
    return dataset


def _add_noise_by_prefix(dataset, prefix, std):
    """Add N(0, std) noise to each semi-ring column whose name starts with `prefix`.

    Noise is added in place to `dataset.agg_dimensions["join_key"]`, one
    independent draw per row of that table and per matching column.
    """
    per_key = dataset.agg_dimensions["join_key"]
    # Materialise the matching names first so the column list is fixed before
    # any column is modified.
    targets = [name for name in dataset.semi_ring_columns() if name.startswith(prefix)]
    for col in targets:
        per_key[col] += np.random.normal(0, std, len(per_key))


# create data: `num_group` keys with `num_rows_per_group` consecutive rows each
mean, cov = get_pos_def_mean_cov(num_att)
df = generate_dataframe(mean, cov, num_rows, columns, B)
joinkey = [group for group in range(num_group) for _ in range(num_rows_per_group)]
df["join_key"] = joinkey

# compute the non-private estimator
buyeragg = _build_join_agg(df, buyer_columns)
selleragg = _build_join_agg(df, seller_columns)

join_train = join_agg(selleragg, buyeragg, "join_key")
non_privatized_cov = join_train.sum()
non_privatized_cov = unbiased_est(non_privatized_cov.copy(deep=True), buyer_columns, seller_columns, num_group, num_rows)
non_privatized_parameter = ridge_linear_regression(non_privatized_cov, X, y, ridge)

# FDP-OPT
# Rebuild both sides from the raw data so the noise below is applied to fresh
# aggregates rather than to the objects used for the non-private join.
buyeragg = _build_join_agg(df, buyer_columns)
selleragg = _build_join_agg(df, seller_columns)

# One sensitivity per aggregate family; as used below, sen1 goes with "cov:c",
# sen2 with the "cov:s:" columns and sen3 with the "cov:Q:" columns.
sen1, sen2, sen3 = join_sensitivity_opt(B)
# Split eps and delta evenly across the three families.
r1, r2, r3 = 1/3, 1/3, 1/3
noise1 = compute_std(eps*r1, delta*r1, sen1)
noise2 = compute_std(eps*r2, delta*r2, sen2)
noise3 = compute_std(eps*r3, delta*r3, sen3)

# impute noise
# Each party's noise vector is sized from its OWN per-key table, so the two
# sides are not required to have the same number of keys.
for party in (selleragg, buyeragg):
    per_key = party.agg_dimensions["join_key"]
    per_key['cov:c'] += np.random.normal(0, noise1, len(per_key))

for party in (selleragg, buyeragg):
    _add_noise_by_prefix(party, 'cov:s:', noise2)
    _add_noise_by_prefix(party, 'cov:Q:', noise3)

fdp_opt_train = join_agg(selleragg, buyeragg, "join_key")
fdp_opt_train_cov = fdp_opt_train.sum()

# Recover a row-count estimate from the noisy joined "cov:c".
# NOTE(review): this presumably relies on the joined count being about
# num_group * rows_per_group**2 (equal-sized groups on both sides) -- confirm.
# A non-positive noisy count would make the square root non-real and int()
# would fail; confirm whether a guard is needed.
est_num_rows = int((fdp_opt_train_cov["cov:c"]/num_group)**0.5*num_group)

# Fit the regression only when the noisy aggregates pass the sanity check.
if not sanity_check(fdp_opt_train_cov, X, ridge):
    print("fail")
else:
    fdp_opt_cov = unbiased_est(fdp_opt_train_cov.copy(deep=True), buyer_columns, seller_columns, num_group, est_num_rows)
    fdp_opt_parameter = ridge_linear_regression(fdp_opt_cov, X, y, ridge)
    print("beta error", get_l2_distance(fdp_opt_parameter, non_privatized_parameter))

beta error 0.8114082938434813
