In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math 

import seaborn as sns

import hyperparams, model, dataloader
from utils import results_to_df

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from IPython.display import display, HTML
# Inject CSS that forces the notebook's `.container` element to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Dataset to run on. Used below to load the data, to look up the per-method
# hyperparameters, and in the file names of the saved figures.
dataset_name = "parkinsons"

In [None]:
# Load the data for the chosen dataset.
# NOTE(review): the unpacking order suggests (train features, test features,
# train targets, test targets) — confirm against `dataloader.load_data`.
X, Xtest, y, ytest = dataloader.load_data(dataset_name)

In [None]:
# Sanity check: mean of the training targets.
# Presumably the positive-class rate if `y` is a 0/1 label vector — TODO confirm.
y.mean()

In [None]:
# Sanity check: sum of the test targets.
# Presumably the number of positive test examples if `ytest` is 0/1 — TODO confirm.
ytest.sum()

In [None]:
# Per-method hyperparameters for this dataset; the cells below index this by method name.
# NOTE(review): the same lookup is repeated in a later cell, so one of the two is redundant.
hyperparam_per_algo = hyperparams.BY_DATASET[dataset_name]

# Run train, predict, sample, evaluate Active Learning loop.

We start with `absolute loss based sampling` and `random sampling`.

We cannot control how many points absloss will sample. Hence, we first sample based on absolute loss, then we calculate the sampling rate from the results, and then we use that sampling rate when we apply random sampling for a fair comparison.

In [None]:
# The results of our hyperparameter tuning run are stored in
# `hyperparams.BY_DATASET`. Load the entry for this dataset; the cells below
# pick the best hyperparameters per method from it by method name.
hyperparam_per_algo = hyperparams.BY_DATASET[dataset_name]

In [None]:
# Run the loop with absolute-loss-based sampling first: its realised sampling
# rate is the reference that the other methods are matched to below.
absloss_params = hyperparam_per_algo["absloss"]

results_absloss = model.sample_train_evaluate_loop(
    X, y, Xtest, ytest, mode="absloss", **absloss_params
)

In [None]:
# Mean of the "probs" recorded by the absloss run; used below as the target
# sampling rate for the other methods.
absloss_mean_sampling_prob = np.mean(results_absloss["probs"])

In [None]:
# Display the reference sampling rate.
absloss_mean_sampling_prob

In [None]:
# Random-sampling baseline. `pz0` is overridden with the mean sampling
# probability observed for absloss so that both methods are compared at the
# same expected sampling rate.
rand_params = hyperparam_per_algo["random"]

results_rand = model.sample_train_evaluate_loop(
    X, y, Xtest, ytest, mode="random", verbose=True, **(rand_params | {"pz0": absloss_mean_sampling_prob})
)

We now run the Active Learning loop for `Polyak absloss`. We aim to match the mean sampling probability that we achieved with `absloss`, which is stored in `absloss_mean_sampling_prob`.

We do this by using Polyak's omega parameter to scale the sampling probabilities up or down. We repeat these omega adjustments until the mean sampling probability of `Polyak absloss` matches, to within `pz_epsilon`, the mean sampling probability that we obtained with `absloss`.

In [None]:
# Tune Polyak's omega until the polyak_absloss run samples at (approximately)
# the same mean rate as the absloss run, so both are compared at equal cost.
pz_epsilon = 0.001  # tolerated absolute gap between the two mean sampling probabilities
max_omega_adjustments = 100  # safety cap: every adjustment costs one full training run

# Reference rate from the absloss run; constant across iterations.
target_sampling_prob = np.mean(results_absloss["probs"])

# Work on a copy so the in-place `*=` below does not mutate the original
# hyperparameter dict while tuning is still in progress (or if it fails).
polyak_absloss_params = dict(hyperparam_per_algo["polyak_absloss"])

results_polyak_absloss = None
for _ in range(max_omega_adjustments):
    results_polyak_absloss = model.sample_train_evaluate_loop(
        X, y, Xtest, ytest, mode="polyak_absloss", verbose=True,
        **polyak_absloss_params
    )
    polyak_sampling_prob = np.mean(results_polyak_absloss["probs"])

    # `<=` is used deliberately: a NaN mean compares False and therefore never
    # counts as converged.
    if abs(target_sampling_prob - polyak_sampling_prob) <= pz_epsilon:
        break

    # Rescale omega by the ratio of target to realised rate (assumes the mean
    # sampling probability moves in the same direction as omega). This is
    # skipped once converged, so the stored omega is exactly the one that
    # produced `results_polyak_absloss`.
    polyak_absloss_params["omega"] *= target_sampling_prob / polyak_sampling_prob
else:
    raise RuntimeError(
        f"polyak_absloss did not reach the target mean sampling probability "
        f"{target_sampling_prob} within {pz_epsilon} after "
        f"{max_omega_adjustments} omega adjustments"
    )

# Publish the tuned hyperparameters: later cells read them from here.
hyperparam_per_algo["polyak_absloss"] = polyak_absloss_params

We now run the Active Learning loop for `Polyak random`. For a fair comparison we again need to match the sampling rate to the one that we observed for absloss. Since Polyak random applies random sampling, we can simply set the sampling rate with `pz0` like we did for random sampling.

In [None]:
# Polyak step size combined with random sampling. `pz0` is overridden with the
# mean sampling probability observed for absloss to match the sampling rate.
# NOTE(review): this reuses the "polyak_absloss" hyperparameters (including the
# omega adjusted in the cell above) — confirm this is intended rather than a
# dedicated "polyak_random" entry.
results_polyak_random = model.sample_train_evaluate_loop(
    X, y, Xtest, ytest, mode="polyak_random", verbose=True, 
    **hyperparam_per_algo["polyak_absloss"] | {"pz0": absloss_mean_sampling_prob}
)

In [None]:
# Run the loop in "absloss-lr-refit" mode with its own "absloss_lr" hyperparameters.
# NOTE(review): unlike the runs above, no sampling-rate matching (pz0 override
# or omega adjustment) is applied here — confirm that is intended.
results_absloss_lr = model.sample_train_evaluate_loop(
    X, y, Xtest, ytest, mode="absloss-lr-refit", verbose=True, 
    **hyperparam_per_algo["absloss_lr"]
)

In [None]:
# Run the loop in "polyak_absloss-lr-refit" mode with its own
# "polyak_absloss_lr" hyperparameters; compared against polyak_absloss in the
# figures further down.
# NOTE(review): no sampling-rate matching (pz0 override or omega adjustment)
# is applied here — confirm that is intended.
results_polyak_absloss_lr = model.sample_train_evaluate_loop(
    X, y, Xtest, ytest, mode="polyak_absloss-lr-refit", verbose=True, 
    **hyperparam_per_algo["polyak_absloss_lr"]
)

# Create figures

In [None]:
# Collect the four main runs into one frame for plotting; the keyword names
# become the method labels passed to `results_to_df`.
plot_df = results_to_df(
    dict(
        random=results_rand,
        absloss=results_absloss,
        polyak_absloss=results_polyak_absloss,
        polyak_random=results_polyak_random,
    )
)

In [None]:
# Figure 1: mean training loss per iteration, one line per method in `plot_df`.

# Enlarge all text elements; must happen before the axes are created so the
# axis/tick label sizes take effect.
plt.rcParams.update(
    {
        "legend.fontsize": "xx-large",
        "axes.labelsize": "xx-large",
        "axes.titlesize": "xx-large",
        "xtick.labelsize": "xx-large",
        "ytick.labelsize": "xx-large",
    }
)

fig, ax = plt.subplots(figsize=(10, 4))
sns.lineplot(
    data=plot_df,
    x="iteration",
    y="mean_train_loss",
    hue="method",
    linewidth=2,
    ax=ax,
)
ax.get_legend().set_title("")  # blank out the legend heading
ax.set_xlim((0, 140))
ax.set(ylabel="average \ncross-entropy loss")

fig.savefig(f"figure1_{dataset_name}.pdf", format="pdf", bbox_inches="tight")


In [None]:
# Figure 5: number of sampled points (i.e. the cost) against the training loss.
# The cost may differ slightly between methods because the sampling rate was
# only fixed *in expectation*, by holding the average sampling probability
# constant; the realized number of sampled points may still vary by method.
# When the costs differ too much the results are likely unreliable and it is
# wise to re-run.

# Enlarge all text elements before the axes are created.
plt.rcParams.update(
    {
        "legend.fontsize": "xx-large",
        "axes.labelsize": "xx-large",
        "axes.titlesize": "xx-large",
        "xtick.labelsize": "xx-large",
        "ytick.labelsize": "xx-large",
    }
)

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(
    data=plot_df,
    x="cost",
    y="mean_train_loss",
    hue="method",
    linewidth=2,
    ax=ax,
)
ax.set(ylabel="average cross entropy loss")
ax.get_legend().set_title("")  # blank out the legend heading

fig.savefig(f"figure5_{dataset_name}.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Keep only the two loss-based methods for the figures that follow.
# NOTE: this overwrites `plot_df`, so re-running an earlier figure cell after
# this one would plot only these two methods.
plot_df = plot_df.loc[plot_df["method"].isin(["absloss", "polyak_absloss"])]

In [None]:
# Figure 6: mean test-set loss per iteration, one line per method in `plot_df`.

# Enlarge all text elements before the axes are created.
plt.rcParams.update(
    {
        "legend.fontsize": "xx-large",
        "axes.labelsize": "xx-large",
        "axes.titlesize": "xx-large",
        "xtick.labelsize": "xx-large",
        "ytick.labelsize": "xx-large",
    }
)

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(
    data=plot_df,
    x="iteration",
    y="mean_test_loss",
    hue="method",
    linewidth=2,
    ax=ax,
)
ax.set_xlim((0, 140))
ax.get_legend().set_title("")  # blank out the legend heading
ax.set(ylabel="average test set loss")

fig.savefig(f"figure6_icml_{dataset_name}.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Figure 7: mean test-set accuracy per iteration, one line per method in `plot_df`.

# Enlarge all text elements before the axes are created.
plt.rcParams.update(
    {
        "legend.fontsize": "xx-large",
        "axes.labelsize": "xx-large",
        "axes.titlesize": "xx-large",
        "xtick.labelsize": "xx-large",
        "ytick.labelsize": "xx-large",
    }
)

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(
    data=plot_df,
    x="iteration",
    y="mean_test_accuracy",
    hue="method",
    linewidth=2,
    ax=ax,
)
ax.set_xlim((0, 140))
ax.set(ylabel="average test set accuracy")
ax.get_legend().set_title("")  # blank out the legend heading

fig.savefig(f"figure7_{dataset_name}.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Rebuild `plot_df` to compare polyak_absloss against its "-lr-refit" variant,
# which is labelled "polyak_absloss_estimator" in the figures.
plot_df = results_to_df(
    dict(
        polyak_absloss=results_polyak_absloss,
        polyak_absloss_estimator=results_polyak_absloss_lr,
    )
)

In [None]:
# Figure 2: mean training loss per iteration, one line per method in `plot_df`.

# Enlarge all text elements before the axes are created.
plt.rcParams.update(
    {
        "legend.fontsize": "xx-large",
        "axes.labelsize": "xx-large",
        "axes.titlesize": "xx-large",
        "xtick.labelsize": "xx-large",
        "ytick.labelsize": "xx-large",
    }
)

fig, ax = plt.subplots(figsize=(10, 4))
sns.lineplot(
    data=plot_df,
    x="iteration",
    y="mean_train_loss",
    hue="method",
    linewidth=2,
    ax=ax,
)
ax.get_legend().set_title("")  # blank out the legend heading
ax.set_xlim((0, 140))
ax.set(ylabel="average \ncross-entropy loss")

fig.savefig(f"figure2_{dataset_name}.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Figure 8: the "cost" column (labelled "sampled labels") per iteration, one
# line per method in `plot_df`.

# Enlarge all text elements before the axes are created.
plt.rcParams.update(
    {
        "legend.fontsize": "xx-large",
        "axes.labelsize": "xx-large",
        "axes.titlesize": "xx-large",
        "xtick.labelsize": "xx-large",
        "ytick.labelsize": "xx-large",
    }
)

fig, ax = plt.subplots(figsize=(10, 4))
sns.lineplot(
    data=plot_df,
    x="iteration",
    y="cost",
    hue="method",
    linewidth=2,
    ax=ax,
)
ax.get_legend().set_title("")  # blank out the legend heading
ax.set(ylabel="sampled labels")

fig.savefig(f"figure8_{dataset_name}.pdf", format="pdf", bbox_inches="tight")