In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from efficient_probit_regression.datasets import BaseDataset, Iris, Covertype, Webspam, KDDCup
from efficient_probit_regression.sampling import leverage_score_sampling
from efficient_probit_regression.probit_model import PGeneralizedProbitModel
from efficient_probit_regression import settings

In [None]:
def get_epsilon_and_beta_dist(k: int, p: float, dataset: BaseDataset):
    """Compare a model fitted on a leverage-score sample with the reference optimum.

    A weighted ``PGeneralizedProbitModel`` is fitted on a sample of size ``k``
    drawn with ``leverage_score_sampling``. Its coefficients are then compared
    with ``dataset.get_beta_opt(p=p)``, using the negative log-likelihood of
    the model built on the full data as the objective.

    Parameters
    ----------
    k : int
        Sample size handed to ``leverage_score_sampling``.
    p : float
        Order forwarded to the models, the sampler and ``get_beta_opt``.
        The driver loop in this notebook also passes non-integer values
        (e.g. 1.5), hence the ``float`` annotation.
    dataset : BaseDataset
        Provides the design matrix, the labels and the reference coefficients.

    Returns
    -------
    tuple
        ``(epsilon, beta_l2, beta_inf)`` where ``epsilon`` is the relative
        deviation of the full-data objective at the reduced fit from its value
        at ``beta_opt``, and ``beta_l2`` / ``beta_inf`` are the 2-norm and
        max-norm of ``beta_opt - beta_reduced``.
    """
    full_model = PGeneralizedProbitModel(p=p, X=dataset.get_X(), y=dataset.get_y())

    X_reduced, y_reduced, weights = leverage_score_sampling(
        X=dataset.get_X(),
        y=dataset.get_y(),
        sample_size=k,
        augmented=True,
        online=False,
        round_up=True,
        p=p,
        fast_approx=True,
    )
    reduced_model = PGeneralizedProbitModel(p=p, X=X_reduced, y=y_reduced, w=weights)
    reduced_model.fit()
    beta_reduced = reduced_model.get_params()

    beta_opt = dataset.get_beta_opt(p=p)

    # Evaluate the full-data objective once per coefficient vector; the
    # reference value is needed both in the numerator and the denominator.
    loss_reduced = full_model.negative_log_likelihood(beta_reduced)
    loss_opt = full_model.negative_log_likelihood(beta_opt)
    epsilon = np.abs(loss_reduced - loss_opt) / loss_opt

    beta_diff = beta_opt - beta_reduced
    beta_l2 = np.linalg.norm(beta_diff, ord=2)
    beta_inf = np.linalg.norm(beta_diff, ord=np.inf)

    return epsilon, beta_l2, beta_inf

In [None]:
# Dataset under study; activate exactly one of the lines below.
# dataset = Webspam()
dataset = Covertype()
# dataset = KDDCup()
# dataset = Iris()

# Accepted band for the relative error epsilon.
eps_min = 0.05
eps_max = 0.1

# Sample-size search interval (k_min, k_max), keyed by dataset name.
_k_bounds_by_dataset = {
    "covertype": (500, 50000),
    "webspam": (500, 50000),
    "kddcup": (500, 50000),
    "iris": (20, 150),
}

_dataset_name = dataset.get_name()
if _dataset_name not in _k_bounds_by_dataset:
    # Fail here rather than leaving k_min / k_max undefined (or stale from an
    # earlier kernel state) when an unknown dataset is selected.
    raise ValueError(
        f"no sample-size bounds configured for dataset {_dataset_name!r}; "
        f"known datasets: {sorted(_k_bounds_by_dataset)}"
    )
k_min, k_max = _k_bounds_by_dataset[_dataset_name]

In [None]:
num_runs = 5

verbose = True

# The bisection needs at least one candidate strictly between the bounds;
# otherwise no sample size would be probed and there would be nothing to record.
if k_max <= k_min + 1:
    raise ValueError(
        f"search interval ({k_min}, {k_max}) contains no candidate sample size"
    )

results_list = []
# p_list = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
p_list = [1, 1.5, 2, 3, 4, 5]
for p in p_list:
    for run in range(1, num_runs + 1):
        # Bisect on the sample size until epsilon lands in [eps_min, eps_max]
        # or the interval cannot be split any further.
        k_min_tmp = k_min
        k_max_tmp = k_max
        found = False
        while k_max_tmp > k_min_tmp + 1:
            k_new = (k_max_tmp + k_min_tmp) // 2
            cur_eps, beta_l2, beta_inf = get_epsilon_and_beta_dist(k=k_new, p=p, dataset=dataset)
            if verbose:
                print("run", run, "p", p, "k_min_tmp", k_min_tmp, "k_max_tmp", k_max_tmp, "k_new", k_new, "cur_eps", cur_eps, "l2", beta_l2, "inf", beta_inf)
            if eps_min <= cur_eps <= eps_max:
                found = True
                if verbose:
                    print(f"Found! k = {k_new}")
                break
            elif cur_eps <= eps_min:
                # Error is below the band: continue with smaller sample sizes.
                k_max_tmp = k_new
            else:
                # Error is above the band: continue with larger sample sizes.
                k_min_tmp = k_new
        if not found:
            # The interval collapsed without hitting the band; the record below
            # holds the last probed k and is flagged through "found".
            print(f"Warning: no k with epsilon in [{eps_min}, {eps_max}] for p = {p}, run = {run}; last k = {k_new}, eps = {cur_eps}")
        results_list.append({"run": run, "p": p, "k": k_new, "beta_l2": beta_l2, "beta_inf": beta_inf, "eps": cur_eps, "found": found})

print(results_list)

In [None]:
# One row per entry of results_list.
df = pd.DataFrame(data=results_list)

# Destination of the per-dataset results file; writing it is left disabled.
filename = settings.RESULTS_DIR / f"{dataset.get_name()}_results_eps.csv"
# df.to_csv(filename, index=False)

df

In [None]:
# Read the per-dataset results file from disk.
# NOTE(review): this replaces any `df` already held in memory.
filename = settings.RESULTS_DIR / f"{dataset.get_name()}_results_eps.csv"
df = pd.read_csv(filepath_or_buffer=filename)
df

In [None]:
# Keep only the rows whose p is one of the integer orders 1..5.
p_list = list(range(1, 6))
df = df[df["p"].isin(p_list)]

In [None]:
# Typeset figure text with LaTeX (amssymb loaded via the preamble), font size 15.
plt.rc("font", size=15)
plt.rcParams.update(
    {
        "text.usetex": True,
        "text.latex.preamble": r'\usepackage{amssymb}',
    }
)

# sns.scatterplot(data=df, x="p", y="k")
# One marker per p: np.min aggregates the k values that share the same p.
sns.lineplot(x="p", y="k", data=df, estimator=np.min, ci=None, marker="o")
# sns.boxplot(data=df, x="p", y="k")

plt.gca().set(xlabel="$p$", ylabel="size")

plt.title(
    f"{dataset.get_name().capitalize()}, " fr'$\varepsilon \in [{eps_min}, {eps_max}]$',
    size=23,
)

plt.tight_layout()

# plt.savefig(settings.PLOTS_DIR / f"{dataset.get_name()}_plot_min_k_p_reduced.pdf")
plt.savefig(settings.PLOTS_DIR / "plot_min_k_p_reduced.pdf")

plt.show()

In [None]:
# Scatter of the "beta_l2" column against p; the column is renamed so the
# y-axis label reads "l2".
sns.scatterplot(
    x="p",
    y="l2",
    data=df.rename({"beta_l2": "l2"}, axis="columns"),
)
# plt.savefig(settings.PLOTS_DIR / "l2.pdf")

In [None]:
# Scatter of the "beta_inf" column against p; the column is renamed so the
# y-axis label reads "inf".
sns.scatterplot(
    x="p",
    y="inf",
    data=df.rename({"beta_inf": "inf"}, axis="columns"),
)
# plt.savefig(settings.PLOTS_DIR / "inf.pdf")

In [None]:
# Typeset figure text with LaTeX (amssymb loaded via the preamble), font size 15.
plt.rcParams["text.usetex"] = True
plt.rcParams['text.latex.preamble'] = r'\usepackage{amssymb}'
plt.rc("font", size=15)


# Normalise k by p * log(p). For p == 1 that product is zero, so p itself
# (i.e. 1) is used as the divisor for those rows instead.
divisor = np.where(df["p"] == 1, df["p"], df["p"] * np.log(df["p"]))
sns.lineplot(x=df["p"], y=df["k"] / divisor, marker="o", ci=None, estimator=np.min)
# sns.lineplot(x=df["p"], y=df["k"] / (np.power(df["p"], 1.5)), marker="o", ci=None, estimator=np.median)
# sns.lineplot(x=df["p"], y=df["k"] / np.power(df["p"], 2), marker="o", ci=None, estimator=np.median)
# sns.lineplot(x=df["p"], y=df["k"] / np.power(df["p"], 3), marker="o", ci=None, estimator=np.median)
# sns.lineplot(x=df["p"], y=df["k"] / df["p"], marker="o", ci=None, estimator=np.median)

plt.xlabel("$p$")

# Raw string: "\l" is not a valid Python escape sequence, so a non-raw literal
# only works by accident and triggers an invalid-escape warning.
plt.ylabel(r"size / $(p \log{p})$")
# plt.ylabel("size / $p^{1.5}$")
# plt.ylabel("size / $p^2$")
# plt.ylabel("size / $p^3$")
# plt.ylabel("size / $p$")

plt.title(f"{dataset.get_name().capitalize()}, " fr'$\varepsilon \in [{eps_min}, {eps_max}]$', size=23)

plt.tight_layout()

plt.savefig(settings.PLOTS_DIR / "eps-p-log-p-adjusted.pdf")
# plt.savefig(settings.PLOTS_DIR / "eps-p-1.5.pdf")
# plt.savefig(settings.PLOTS_DIR / "eps-p-2.pdf")
# plt.savefig(settings.PLOTS_DIR / "eps-p-3.pdf")
# plt.savefig(settings.PLOTS_DIR / "eps-p.pdf")

In [None]:
# Scratch cell: displays the natural logarithm of 3.
# NOTE(review): presumably a spot check for the p * log(p) divisor at p = 3 — confirm or remove.
np.log(3)