In [None]:
from efficient_probit_regression.datasets import Covertype
from efficient_probit_regression.sampling import gibbs_sampler_probit
from efficient_probit_regression import settings
from efficient_probit_regression.sampling import leverage_score_sampling, compute_leverage_scores

import numpy as np
import plotly.graph_objects as go
import pandas as pd

from tqdm import tqdm

In [None]:
# Load the Covertype dataset wrapper and pull out everything the later cells need.
dataset = Covertype()

# Data returned by the dataset accessors (presumably the feature matrix and labels — TODO confirm).
X, y = dataset.get_X(), dataset.get_y()

# n and d as reported by the dataset (presumably row count and feature dimension — TODO confirm).
n, d = dataset.get_n(), dataset.get_d()

# Reference coefficient vector provided by the dataset.
beta_opt = dataset.get_beta_opt()

In [None]:
# Grid of subsample sizes: min_size, min_size + step_size, ..., max_size.
min_size, max_size = 500, 15_000
step_size = 500

# np.arange excludes its stop value, so extend the stop by one step to keep max_size in the grid.
sizes = np.arange(min_size, max_size + step_size, step_size)

In [None]:
# Prior handed to the Gibbs sampler: zero mean vector of length d and
# covariance equal to 10 times the d x d identity matrix.
prior_mean = np.zeros(d)
prior_cov = 10 * np.eye(d)

samples_per_chain = 250

# Leverage scores are computed once on the full data and reused for every subsample size.
leverage_scores = compute_leverage_scores(X)

# Options that stay fixed across all subsample sizes.
subsampling_options = dict(
    augmented=True,
    online=False,
    round_up=True,
    precomputed_scores=leverage_scores,
)
sampler_options = dict(
    prior_mean=prior_mean,
    prior_cov=prior_cov,
    num_samples=samples_per_chain,
    num_chains=4,
    burn_in=100,
)

# One record per subsample size: {"size": <subsample size>, "sample": <sampler output>}.
samples = []
for cur_size in tqdm(sizes):
    # Draw a subsample of the requested size using the precomputed leverage scores.
    # The third return value is not used here.
    # NOTE(review): if it carries sampling weights, confirm the sampler is meant to ignore them.
    X_reduced, y_reduced, _ = leverage_score_sampling(
        X=X, y=y, sample_size=cur_size, **subsampling_options
    )

    # Run the Gibbs sampler on the reduced data only.
    cur_sample = gibbs_sampler_probit(X=X_reduced, y=y_reduced, **sampler_options)

    record = {"size": cur_size, "sample": cur_sample}
    samples.append(record)

In [None]:
# Column names beta_0 ... beta_{d-1}, built once and shared by every per-size frame.
beta_columns = [f"beta_{i}" for i in range(d)]

# One frame per subsample size: the sampler output as beta_* columns
# (assumes each "sample" entry is 2-D with d columns — TODO confirm),
# plus a trailing "size" column tagging every row with its subsample size.
df_list = [
    pd.DataFrame(entry["sample"], columns=beta_columns).assign(size=entry["size"])
    for entry in samples
]

# Stack all sizes into one long frame with a fresh 0..N-1 index.
df = pd.concat(df_list, ignore_index=True)

# Persist the combined draws to the results directory, without the index column.
output_path = settings.RESULTS_DIR / f"{dataset.get_name()}_reduced_samples.csv"
df.to_csv(output_path, index=False)

df