In [11]:
import torch
from synthetic_exp import *
from test_utils import *
from scipy import stats
from sketch_search import JoinSketch
from bivariate_estimator import BivariateEstimator

In [2]:
# Shared experiment settings. `runs` and `num_nodes` are handed to
# gen_synthetic_data below; all three are later passed to accuracy_exp.
runs = 100
num_nodes = [
    10,
    50,
    100,
    500,
]
mi_thresholds = [
    0.01,
    0.02,
    0.05,
    0.1,
]

# Generate the synthetic inputs up front.
# NOTE(review): the third positional argument (10000) is presumably the number
# of samples per dataset -- confirm against gen_synthetic_data's signature.
gen_synthetic_data(runs, num_nodes, 10_000)

In [None]:
# Run the accuracy experiment for every (hist, fhm) setting, first with
# gpu=False and then with gpu=True (gpu=True selects the GPU version of Suna).
# The four configurations execute in the same order as before:
#   (hist=False, fhm=True,  gpu=False)
#   (hist=True,  fhm=False, gpu=False)
#   (hist=False, fhm=True,  gpu=True)
#   (hist=True,  fhm=False, gpu=True)
#
# Every result is kept in `accuracy_results`, keyed by (hist, fhm, gpu), so
# earlier runs are no longer discarded when `res` is reassigned. `res` itself
# still ends up holding the result of the last configuration.
accuracy_results = {}
for use_gpu in (False, True):
    for use_hist, use_fhm in ((False, True), (True, False)):
        res = accuracy_exp(
            runs, num_nodes, mi_thresholds, hist=use_hist,
            fhm=use_fhm, bootstrap=False, gpu=use_gpu
        )
        accuracy_results[(use_hist, use_fhm, use_gpu)] = res

In [13]:
def _hist_kl_divergence(est_counts, ref_counts, eps=1e-10):
    """Return the KL divergence KL(est || ref) between two histograms.

    Assumes both inputs are 1-D sequences of bin counts. The shorter one is
    right-padded with zeros so both have the same number of bins; each is then
    normalised to sum to 1 and clipped from below at ``eps`` so that empty bins
    do not produce an infinite divergence. ``stats.entropy`` renormalises its
    inputs before computing the divergence.

    Args:
        est_counts: estimated histogram (array-like).
        ref_counts: reference / ground-truth histogram (array-like).
        eps: lower bound applied to every normalised bin.

    Returns:
        The value of ``stats.entropy(est, ref)`` on the prepared histograms.
    """
    width = max(len(est_counts), len(ref_counts))
    p = np.pad(est_counts, (0, width - len(est_counts)))
    q = np.pad(ref_counts, (0, width - len(ref_counts)))
    p = np.clip(p / np.sum(p), eps, None)
    q = np.clip(q / np.sum(q), eps, None)
    return stats.entropy(p, q)


# NOTE(review): `np` and `pickle` are not imported explicitly in the import
# cell; presumably they are re-exported by one of the star imports -- confirm.
num_samples = [100, 1000, 5000, 10000, 50000]
deg = 1
graph_size = 50   # passed to both dp.generate_G and random_pair
max_targets = 10  # att2 keeps at most this many columns

# For each sample size: generate data, run the sketch-based estimator, then
# compare its histograms / entropies against the ground truth. Two numbers are
# printed per sample size: the mean KL divergence over histograms, followed by
# the mean absolute entropy error.
for num_sample in num_samples:
    dp = DataProfile(seed=0)
    dp.generate_G(graph_size)
    dp.generate_D_from_G(num_samples=num_sample)

    # att1 is the single column V{i}; att2 is the first `max_targets` columns
    # other than V{i}, V{j} and the join key.
    i, j = random_pair(graph_size)
    att1 = [f'V{i}']
    excluded = {f'V{i}', f'V{j}', 'join_key'}
    att2 = [att for att in dp.D.columns if att not in excluded][:max_targets]

    # NOTE(review): the sketches are registered with deg=2 while the estimator
    # and the ground truth below use `deg` (1) -- confirm this is intended.
    sketch_1 = JoinSketch(join_key_domain=dp.join_key_domain)
    sketch_1.register_df(1, dp.D[att1 + ['join_key']], att1, deg=2)
    sketch_2 = JoinSketch(join_key_domain=dp.join_key_domain)
    sketch_2.register_df(2, dp.D[att2 + ['join_key']], att2, deg=2)
    msr1 = sketch_1.sketch_loader.batch_sketches[0]
    msr2 = sketch_2.sketch_loader.batch_sketches[0]

    X = torch.tensor(dp.D[att1].values, dtype=torch.float32)
    Y = torch.tensor(dp.D[att2].values, dtype=torch.float32)

    linearHistMI = FactorizedLinearHistMI()
    be = BivariateEstimator(degree=deg, method=linearHistMI)
    # currently ugly-coded to save to pickles: presumably compute_mi writes
    # hist_<n_rows>.pkl as a side effect, which is read back below -- confirm.
    mi_diff, r2, _ = be.compute_mi(msr1, msr2, X, Y, std=True)
    H_x, H_res_y, _, _, gt_hist = hist_mi_gt(
        dp.D[att1 + ['join_key']], dp.D[att2 + ['join_key']], deg, std=True
    )

    # SECURITY: pickle.load can execute arbitrary code; only load files this
    # notebook produced itself.
    with open(f'hist_{len(dp.D)}.pkl', 'rb') as fh:
        est_hist, est_ent = pickle.load(fh)

    # Column `col` of est_hist is compared with gt_hist[col]. A comprehension
    # is used so the index does not overwrite `i` from random_pair above.
    results = [
        _hist_kl_divergence(est_hist[:, col].numpy(), gt_hist[col])
        for col in range(len(gt_hist))
    ]
    print(np.mean(results))
    print(np.mean(np.abs(est_ent.numpy() - H_res_y)))

0.05400228122163471
0.03433981579133163
0.020600831504375933
0.014768202614067194
0.01050403187609537
0.007544159105812964
0.00886598226859952
0.007640318892029896
0.00575353040192979
0.0030481184384225646
