In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from efficient_probit_regression.datasets import BaseDataset, Iris, Covertype, Webspam
from efficient_probit_regression.sampling import leverage_score_sampling
from efficient_probit_regression.probit_model import PGeneralizedProbitModel
from efficient_probit_regression import settings

In [None]:
def get_epsilon(k: int, p: float, dataset: BaseDataset):
    """Relative loss error of a model fitted on a leverage-score sample of size ``k``.

    A ``PGeneralizedProbitModel`` is fitted on a weighted sample drawn with
    ``leverage_score_sampling``; its parameters are then scored with the
    negative log-likelihood of the model built on the complete data set and
    compared against the loss at ``dataset.get_beta_opt(p=p)``.

    Parameters
    ----------
    k : int
        Sample size handed to ``leverage_score_sampling``.
    p : float
        Order of the p-generalized probit model (callers in this notebook
        also pass non-integer values such as 1.5).
    dataset : BaseDataset
        Provides ``get_X()``, ``get_y()`` and ``get_beta_opt(p=...)``.

    Returns
    -------
    ``|f(beta_reduced) - f(beta_opt)| / f(beta_opt)`` where ``f`` is the
    full-data negative log-likelihood.
    """
    full_model = PGeneralizedProbitModel(p=p, X=dataset.get_X(), y=dataset.get_y())

    X_reduced, y_reduced, weights = leverage_score_sampling(
        X=dataset.get_X(),
        y=dataset.get_y(),
        sample_size=k,
        augmented=True,
        online=False,
        round_up=True,
        p=p,
        fast_approx=True,
    )
    reduced_model = PGeneralizedProbitModel(p=p, X=X_reduced, y=y_reduced, w=weights)
    reduced_model.fit()
    beta_reduced = reduced_model.get_params()

    # NOTE(review): presumably the optimum on the full data - confirm in BaseDataset.
    beta_opt = dataset.get_beta_opt(p=p)

    # Each evaluation touches the full data set, so compute the reference
    # loss exactly once and reuse it for numerator and denominator.
    loss_reduced = full_model.negative_log_likelihood(beta_reduced)
    loss_opt = full_model.negative_log_likelihood(beta_opt)

    epsilon = np.abs(loss_reduced - loss_opt) / loss_opt

    return epsilon

In [None]:
dataset = Webspam()

# Target band for the relative error returned by get_epsilon: the search
# accepts a sample size once eps_min <= epsilon <= eps_max.
eps_min = 0.05
eps_max = 0.10

# Initial bracket [k_min, k_max] for the sample-size bisection, per dataset.
dataset_name = dataset.get_name()
if dataset_name == "covertype":
    k_min = 500
    k_max = 20000
elif dataset_name == "webspam":
    k_min = 500
    k_max = 50000
else:
    # Fail loudly instead of leaving k_min / k_max undefined (or silently
    # reusing stale values from an earlier kernel run).
    raise ValueError(
        f"No sample-size search bounds configured for dataset {dataset_name!r}"
    )

In [None]:
num_runs = 1

verbose = True

# Bisection over the sample size k, separately for every p and run: look for
# a k whose relative error lands inside [eps_min, eps_max].
# NOTE(review): the update rule assumes the error shrinks as k grows; the
# sampling is presumably random, so this only holds approximately - confirm.
results_list = []
for p in [1, 1.5, 2, 3, 4, 5]:
    for run in range(1, num_runs + 1):
        k_min_tmp, k_max_tmp = k_min, k_max
        while k_max_tmp > k_min_tmp + 1:
            k_new = (k_max_tmp + k_min_tmp) // 2
            cur_eps = get_epsilon(k=k_new, p=p, dataset=dataset)
            if verbose:
                print("p", p, "k_min_tmp", k_min_tmp, "k_max_tmp", k_max_tmp, "k_new", k_new, "cur_eps", cur_eps)
            if eps_min <= cur_eps <= eps_max:
                if verbose:
                    print(f"Found! k = {k_new}")
                break
            elif cur_eps < eps_min:
                # Error below the band: a smaller sample may suffice.
                k_max_tmp = k_new
            else:
                # Error above the band (or not comparable): need a larger sample.
                k_min_tmp = k_new
        else:
            # The bracket collapsed without any epsilon inside the band; the
            # value recorded below was never verified, so say so explicitly.
            print(
                f"Warning: no k with epsilon in [{eps_min}, {eps_max}] found "
                f"for p = {p} (run {run}); recording the bracket midpoint."
            )
        # After a `break` the bounds are unchanged, so this equals k_new.
        k_final = (k_max_tmp + k_min_tmp) // 2
        results_list.append({"run": run, "p": p, "k": k_final})

print(results_list)

In [None]:
# One row per (run, p) with the sample size k selected by the search.
df = pd.DataFrame(results_list)

filename = settings.RESULTS_DIR / f"{dataset.get_name()}_results_eps.csv"
# Make sure the target directory exists so the export cannot fail after the
# long-running search.
# NOTE(review): assumes settings.RESULTS_DIR is a pathlib.Path - confirm.
filename.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filename, index=False)

df

In [None]:
# Typeset all figure text with LaTeX (amssymb loaded in the preamble) and
# set the base font size.
plt.rcParams.update(
    {
        "text.usetex": True,
        "text.latex.preamble": r'\usepackage{amssymb}',
        "font.size": 15,
    }
)

# Selected sample size k as a function of p, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.lineplot(data=df, x="p", y="k", marker="o", ax=ax)

ax.set_ylabel("size")
ax.set_xlabel("$p$")

# Title: capitalized dataset name followed by the targeted epsilon band.
ax.set_title(
    f"{dataset.get_name().capitalize()}, " fr'$\varepsilon \in [{eps_min}, {eps_max}]$',
    size=23,
)

fig.tight_layout()

# plt.savefig(settings.PLOTS_DIR / f"{dataset.get_name()}_eps_plot.pdf")

plt.show()