In [None]:
from efficient_probit_regression.datasets import Iris, Covertype, KDDCup, Webspam
from efficient_probit_regression.sampling import compute_leverage_scores, leverage_score_sampling, uniform_sampling
from efficient_probit_regression import settings

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import seaborn as sns

import numpy as np
import pandas as pd

In [None]:
# Choose the dataset to visualise: keep exactly one assignment active.
# dataset = Iris()
# dataset = Covertype()
# dataset = KDDCup()
dataset = Webspam()

# Feature matrix and labels as provided by the dataset object.
X = dataset.get_X()
y = dataset.get_y()

# One score per row of X, used below as (unnormalised) sampling weights.
# NOTE(review): p=2 / fast_approx=False presumably request exact l2 leverage
# scores -- confirm against compute_leverage_scores.
scores = compute_leverage_scores(
    X,
    p=2,
    fast_approx=False,
)

In [None]:
# Project the data onto its first two principal components for plotting.
# random_state is fixed because PCA may select a randomized SVD solver for large
# inputs; without it the projection could differ between runs (the global NumPy
# seed is only set in a later cell).
pca = PCA(n_components=2, random_state=1)
X_new = pca.fit_transform(X)

In [None]:
# Number of points drawn per sampling scheme. Clamped to the dataset size because
# np.random.choice(..., replace=False) raises ValueError when the requested size
# exceeds the population (i.e. for any dataset with fewer than 500 rows).
sample_size = min(500, dataset.get_n())

# Seed the legacy global NumPy RNG so both draws below are repeatable.
np.random.seed(1)

# Draw row indices without replacement: first with probability proportional to the
# scores, then uniformly. The order of the two draws is kept so the RNG stream is
# consumed exactly as before.
X_leverage = X_new[np.random.choice(dataset.get_n(), size=sample_size, replace=False, p = scores / np.sum(scores))]
X_uniform = X_new[np.random.choice(dataset.get_n(), size=sample_size, replace=False)]

# Long-format frame: one row per sampled point, tagged with its sampling method,
# so seaborn can colour the two samples via `hue`.
plot_df = pd.concat([
    pd.DataFrame({"X1": X_leverage[:,0], "X2": X_leverage[:,1], "method": "leverage"}),
    pd.DataFrame({"X1": X_uniform[:,0], "X2": X_uniform[:,1], "method": "uniform"}),
], ignore_index=True)

plot_df.head()

In [None]:
# use TeX for typesetting (requires a working LaTeX installation)
plt.rcParams["text.usetex"] = True
plt.rc("font", size=15)

fig, ax = plt.subplots()

# Scatter of the two samples in the PCA plane, coloured by sampling method.
sns.scatterplot(data=plot_df, x="X1", y="X2", hue="method", ax=ax)

ax.set_xlabel("PC1")
ax.set_ylabel("PC2")

ax.set_title(f"{dataset.get_name().capitalize()} PCA")

ax.legend(loc="upper right", frameon=True, title="method")

filename = f"{dataset.get_name()}_pca.pdf"

# Optional zoomed view with hard-coded axis limits, title and file name for Webspam.
# Only enable this when `dataset` is Webspam, otherwise the plot is mislabelled.
webspam_zoom = False
if webspam_zoom:
    ax.set_xlim(left=-0.5, right=2)
    ax.set_ylim(bottom=-10, top=25)
    ax.set_title("Webspam PCA Zoomed")
    filename = "webspam_pca_zoomed.pdf"

fig.savefig(settings.PLOTS_DIR / filename)

# plt.show() rather than fig.show(): the latter only emits a warning on non-GUI
# (e.g. inline notebook) backends.
plt.show()