In [1]:
from al.sampling.uncert import off_centered_entropy, entropy_info
from al.sampling.uncert.classification.prior import ClassWeightedEntropy
from al.sampling.baseline import random
from al.sampling.combination import InfoEnsemble, ProductAggregation, SumAggregation
from al.sampling.repr.pr_density import PrDensity
from al.sampling.repr.knn import k_nearest_neighbor_repr
from al.sampling.kernels import UniformKernel, GaussianKernel, ConstantBandwidth, SilvermanTactic, ScottTactic
from al.sampling.qbc import BootstrapJS
import torch
from al.distances import L2Distance, JensenShannonDistance
from al.plot.results import plot_metric

from al.loops.base import ALDataset, LoopConfig, LoopMetric
from al.loops.experiments import ExperimentResults, NClassesGuaranteeWrapper, XGBWrapper, create_AL_dataset_from_openml, run_experiment
from al.loops.perfect_oracle import active_learning_loop
import xgboost
from al.kde import NaiveKDE, ClusteringKDE
from torch.utils.data import TensorDataset
import pandas as pd
from al.base import ImbalancedModelPriorPredictTactic

import functools

In [3]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA instead of failing on the
# first tensor creation.
torch.set_default_device('cuda' if torch.cuda.is_available() else 'cpu')

# Fixed seed for torch's random number generator.
torch.manual_seed(128)

<torch._C.Generator at 0x2437f969250>

In [2]:
def load_firefighters():
    """Build an ``ALDataset`` from the firefighters CSV files.

    Reads ``train.csv`` and ``test.csv`` (with a header row) from
    ``../../datasets/firefighters/`` and stacks them row-wise. The
    ``labelPose`` column, converted to integer category codes, is the target;
    both ``labelPose`` and ``labelAction`` are removed from the features.

    Returns:
        ALDataset wrapping a ``TensorDataset`` of (features, targets).
    """
    data_dir = '../../datasets/firefighters/'
    frames = []
    for file_name in ('train.csv', 'test.csv'):
        frames.append(pd.read_csv(data_dir + file_name, header=0))
    table = pd.concat(frames, axis=0)

    # Category codes give one integer id per distinct pose label.
    pose_codes = table["labelPose"].astype('category').cat.codes
    feature_table = table.drop(columns=["labelPose", "labelAction"])

    features = torch.tensor(feature_table.to_numpy())
    targets = torch.tensor(pose_codes.to_numpy(), dtype=torch.int64)
    return ALDataset(TensorDataset(features, targets.squeeze()))

def load_sod():
    """Build an ``ALDataset`` from the sod CSV files.

    Feature files (``x_*.csv``) and label files (``y_*.csv``) are read without
    a header row from ``../../datasets/sod/`` and stacked row-wise in the same
    order (loop, experts_kb, test).

    Returns:
        ALDataset wrapping a ``TensorDataset`` of (features, squeezed labels).
    """
    data_dir = '../../datasets/sod/'

    def _read_stacked(file_names):
        """Read every CSV in ``file_names`` and concatenate them row-wise."""
        parts = [pd.read_csv(data_dir + file_name, header=None) for file_name in file_names]
        return pd.concat(parts, axis=0)

    feature_table = _read_stacked(['x_loop.csv', 'x_experts_kb.csv', 'x_test.csv'])
    label_table = _read_stacked(['y_loop.csv', 'y_experts_kb.csv', 'y_test.csv'])

    features = torch.tensor(feature_table.to_numpy())
    targets = torch.tensor(label_table.to_numpy())
    # squeeze() drops the singleton column dimension of the label table.
    return ALDataset(TensorDataset(features, targets.squeeze()))

In [5]:
# To run an experiment on a given dataset, uncomment its line below and comment out the others.

# To obtain the firefighters or sod datasets, please contact the knowledgepit.ml platform administrators.
# The data files should be placed in the datasets folder of the main repo, at the paths used
# by the loader functions defined in the previous cell.

# Datasets whose experiments use NaiveKDE

# dataset, save_path = ALDataset(create_AL_dataset_from_openml(554).dataset), './mnist_final.bin'
# dataset, save_path = load_firefighters(), './firefighters_final.bin'

# Datasets whose experiments use ClusteringKDE

dataset, save_path = ALDataset(create_AL_dataset_from_openml(1596).dataset), './covertype_final.bin'
# dataset, save_path = ALDataset(create_AL_dataset_from_openml(182).dataset), './satimage_final.bin'
# dataset, save_path = ALDataset(create_AL_dataset_from_openml(6).dataset), './letter_final.bin'

# Dataset with custom hyperparameters
# dataset, save_path = load_sod(), './sod_final.bin'





In [18]:
# Empirical class priors: per-class sample counts normalised to sum to one.
class_counts = torch.bincount(dataset.targets)
priors = (class_counts / class_counts.sum()).tolist()
# Space-separated whole-number percentages, one per class.
priors_formated = " ".join(f"{prior:.0%}" for prior in priors)
priors_formated

'36% 49% 6% 0% 2% 3% 4%'

In [19]:
# One " & "-separated row describing the dataset: two sizes, feature count,
# class count and class priors.
# NOTE(review): the 0.5 and 200 presumably mirror the test_frac=0.5 / init_frac=200
# passed to run_experiment elsewhere in this notebook — confirm.
n_samples = len(dataset)
desc_fields = [
    f"{n_samples * 0.5 - 200:.0f}",
    f"{n_samples * 0.5:.0f}",
    f"{dataset.features.shape[1]}",
    f"{len(priors)}",
    priors_formated,
]
dataset_desc = " & ".join(desc_fields)
print(dataset_desc)

290306 & 290506 & 98 & 7 & 36% 49% 6% 0% 2% 3% 4%


In [3]:
# Shared building blocks for the density-based measures.
gausian_kernel = GaussianKernel()
uniform_kernel = UniformKernel()
l2_distance = L2Distance()

# KDE backend handed to every PrDensity created below.
kde_estimator = functools.partial(ClusteringKDE, index_device=torch.device("cpu"))

# Uncomment for naive KDE implementation
# kde_estimator = NaiveKDE


def _pr_density(kernel, bandwidth_tactic):
    """Create a PrDensity over the L2 distance using the selected KDE backend."""
    return PrDensity(kernel=kernel, distance=l2_distance, kde_class=kde_estimator,
                     bandwidth_tactic=bandwidth_tactic)


def _falc(density):
    """Combine the shared class-weighted entropy with ``density`` via the shared aggregation."""
    return InfoEnsemble([class_weighted_entropy, density], aggregation_tactic=pr_density_aggregation)


# Baseline: entropy combined with k-nearest-neighbour representativeness.
knn_entropy = InfoEnsemble([entropy_info, k_nearest_neighbor_repr], aggregation_tactic=ProductAggregation())

# FALC methods: every kernel x bandwidth-tactic combination.
proba_density_silverman = _pr_density(uniform_kernel, SilvermanTactic())
proba_density_scott = _pr_density(uniform_kernel, ScottTactic())
proba_density_silverman_gauss = _pr_density(gausian_kernel, SilvermanTactic())
proba_density_scott_gauss = _pr_density(gausian_kernel, ScottTactic())

pr_density_aggregation = ProductAggregation()
class_weighted_entropy = ClassWeightedEntropy()  # TODO rename to also include proba?
proba_density_entropy_silverman = _falc(proba_density_silverman)
proba_density_entropy_scott = _falc(proba_density_scott)
proba_density_entropy_silverman_gauss = _falc(proba_density_silverman_gauss)
proba_density_entropy_scott_gauss = _falc(proba_density_scott_gauss)

# Query-by-committee baseline with five bootstrap models.
qbc = BootstrapJS(n_models=5)

# Every method compared in the experiment, FALC variants first.
uncerts = [
    proba_density_entropy_scott,
    proba_density_entropy_silverman,
    proba_density_entropy_silverman_gauss,
    proba_density_entropy_scott_gauss,
    knn_entropy,
    qbc,
    random,
    entropy_info,
    off_centered_entropy,
]

In [21]:
# Print the name reported by each sampling method in `uncerts`.
for uncert in uncerts:
    print(uncert.name)

EnsembleClassWeightedEntropy1.0_ProbaDensityUniformKernelL2DistanceScottTactic1.0
EnsembleClassWeightedEntropy1.0_ProbaDensityUniformKernelL2DistanceSilvermanTactic1.0
EnsembleClassWeightedEntropy1.0_ProbaDensityGaussianKernelL2DistanceSilvermanTactic1.0
EnsembleClassWeightedEntropy1.0_ProbaDensityGaussianKernelL2DistanceScottTactic1.0
EnsembleEntropy1.0_KNearestNeighborRepr1.0
BootstrapJS
Random
Entropy
OffCenteredEntropy


In [22]:
# Metrics requested from the active learning loop.
config = LoopConfig(metrics=[LoopMetric.BAC, LoopMetric.BAC_from_predict])

# XGBoost settings shared by all datasets; the objective depends on the dataset.
xgb_params = dict(random_state=42, tree_method='hist', n_jobs=16, device='cpu', eta=0.1)
if save_path.startswith("./sod"):
    print("using redefined params")
    # positive samples are ~6% of data, this comes from domain knowledge
    xgb_params.update(objective='binary:logistic', base_score=0.06)
else:
    xgb_params.update(objective='multi:softprob')
xgbmodel = xgboost.XGBClassifier(**xgb_params)

model = NClassesGuaranteeWrapper(
    XGBWrapper(xgbmodel, predict_tactic=ImbalancedModelPriorPredictTactic()),
    n_classes=dataset.n_classes,
)

In [None]:
# Main experiment: run the active learning loop for every method in `uncerts`
# with budget=50 and n_repeats=10. `save_path` is forwarded to run_experiment,
# presumably as the file the results are written to (later cells load files with these names).
# NOTE(review): `init_frac=200` looks like an absolute sample count rather than a fraction — confirm.
config_results = run_experiment(active_learning_loop, data=dataset, infos=uncerts, config=config, model=model, init_frac=200, save_path=save_path, budget=50, n_repeats=10)

In [None]:
# Profile one short run of the experiment (n_repeats=1, budget=30, init_frac=100).
# cProfile.run receives the statement as a string and executes it itself, so the
# names inside it (dataset, uncerts, config, model) must already exist in the notebook namespace.
import cProfile
cProfile.run('run_experiment(active_learning_loop, data=dataset, infos=uncerts, config=config, model=model, init_frac=100, save_path="perftest.bin", budget=30, n_repeats=1)')

In [11]:

# Legend labels for plots, keyed by the name each method reports.
_legend_labels = (
    (proba_density_entropy_scott, '$FALC_u^{sc}$'),
    (proba_density_entropy_silverman, '$FALC_u^{si}$'),
    (proba_density_entropy_silverman_gauss, '$FALC_g^{si}$'),
    (proba_density_entropy_scott_gauss, '$FALC_g^{sc}$'),
    (knn_entropy, 'KNN-Entropy'),
    (qbc, 'BootstrapJS'),
    (random, 'Random'),
    (entropy_info, 'Entropy'),
    (off_centered_entropy, 'Off-Centered Entropy'),
)
names_mapping = {method.name: label for method, label in _legend_labels}

In [4]:
# Result files of the final experiments, one per dataset, in reporting order.
_result_datasets = ('mnist', 'firefighters', 'covertype', 'satimage', 'letter', 'sod')
all_results_paths = [f'./{dataset_name}_final.bin' for dataset_name in _result_datasets]

In [None]:
import os

import matplotlib.pyplot as plt

# plt.savefig does not create missing directories, so make sure the target exists.
os.makedirs("images", exist_ok=True)

# Iterate with a dedicated name: using `save_path` as the loop variable would
# overwrite the global `save_path` chosen in the dataset-selection cell, so
# re-running the model-setup or experiment cells afterwards would silently
# target the last file of this list.
for results_path in all_results_paths:
    config_results = ExperimentResults.load(results_path)
    # The file-name stem is the same for both metrics; compute it once per file.
    results_stem = results_path.removesuffix('.bin').removeprefix("./")
    for metric in (LoopMetric.BAC_from_predict.name, LoopMetric.BAC.name):
        plot_metric(config_results, metric, add_mean_to_legened=True, names_mapping=names_mapping)
        # Place the legend outside the axes, to the right of the plot.
        plt.legend(bbox_to_anchor=(1.02, 1.0), loc='upper left', borderaxespad=0)
        plt.xlabel("Iteration number")
        if metric == LoopMetric.BAC_from_predict.name:
            plt.ylabel("$BAC^{mp}$")
        else:
            plt.ylabel("$BAC$")
        plt.savefig(f"images/{metric}_{results_stem}.png", bbox_inches="tight")
        plt.show()


In [10]:
# Make the CPU the default torch device from here on, and display the number
# of threads torch reports.
torch.set_default_device('cpu')
torch.get_num_threads()

24

In [23]:

# Runtime comparison: the same FALC ensemble backed by ClusteringKDE vs. NaiveKDE.
uniform_kernel = UniformKernel()
l2_distance = L2Distance()

clustering_kde_estimator = functools.partial(ClusteringKDE, index_device=torch.device("cpu"))

# FALC methods
proba_density_clustering = PrDensity(kernel=uniform_kernel, distance=l2_distance, kde_class=clustering_kde_estimator,
                                      bandwidth_tactic=SilvermanTactic())
pr_density_aggregation = ProductAggregation()
class_weighted_entropy = ClassWeightedEntropy() # TODO rename to also include proba?
falc_clussteing_kde = InfoEnsemble([class_weighted_entropy, proba_density_clustering], aggregation_tactic=pr_density_aggregation)

proba_density_naive = PrDensity(kernel=uniform_kernel, distance=l2_distance, kde_class=NaiveKDE,
                                      bandwidth_tactic=SilvermanTactic())

falc_naive_kde = InfoEnsemble([class_weighted_entropy, proba_density_naive], aggregation_tactic=pr_density_aggregation)

# The current implementation of names does not include the KDE estimator, so the
# `name` property is patched to make the two ensembles above distinguishable.
# WARNING: this replaces `name` on the InfoEnsemble class itself, so from here on
# EVERY InfoEnsemble instance other than `falc_naive_kde` (including ensembles
# built in earlier cells) reports "ClusteringKDE". Restart the kernel before
# re-running cells that rely on the original names.
InfoEnsemble.name = property(lambda self: "NaiveKDE" if self is falc_naive_kde else "ClusteringKDE")


uncerts = [falc_naive_kde, falc_clussteing_kde, entropy_info]

In [24]:
# Print the name reported by each method in the current `uncerts` list.
for uncert in uncerts:
    print(uncert.name)

NaiveKDE
ClusteringKDE
Entropy


In [25]:
# No output file for the timing run; the commented path is the alternative.
# NOTE(review): assumes run_experiment treats save_path=None as "do not save" — confirm.
save_path = None # "./performance_times_covertype.bin"

In [None]:
# Timing run: configure the loop with return_info_times=True and run it with
# budget=1 and n_repeats=10 for the methods currently in `uncerts`.
config = LoopConfig(return_info_times = True)

config_results = run_experiment(active_learning_loop, data=dataset, infos=uncerts, config=config, model=model, init_frac=200, test_frac=0.5, save_path=save_path, budget=1, n_repeats=10)


In [None]:
# Display the current experiment results object.
config_results

In [None]:
import os

from al.plot.results import plot_info_times
import matplotlib.pyplot as plt

# Reload the results from disk only when a results file is configured. With
# `save_path = None` (as set in an earlier cell) there is nothing to load, so
# the in-memory `config_results` is plotted instead.
if save_path is not None:
    config_results = ExperimentResults.load(save_path)

# plt.savefig does not create missing directories.
os.makedirs("images", exist_ok=True)

plot_info_times(config_results)
plt.savefig("images/performance_comparison.png", bbox_inches="tight")



In [29]:
# For every entry in the results: its key, then the mean and the standard
# deviation of its recorded info times, one value per line.
for result, val in config_results.res.items():
    info_times = val.info_times
    print(result, info_times.mean(), info_times.std(), sep="\n")

NaiveKDE
tensor(2.9858, device='cuda:0')
tensor(0.1027, device='cuda:0')
ClusteringKDE
tensor(3.6132, device='cuda:0')
tensor(0.8075, device='cuda:0')
Entropy
tensor(0.1657, device='cuda:0')
tensor(0.0164, device='cuda:0')


In [18]:
def compute_bac_pred_over_bac(results_paths):
    """Print summary statistics of ``BAC_from_predict - BAC`` over saved experiments.

    For every results file in ``results_paths`` and every configuration stored
    in it, the difference between the two metrics is collected. Printed, in
    order: the maximum difference, the minimum difference, and one line with
    the fraction of entries where the difference is positive, the mean
    difference and its standard deviation.

    Args:
        results_paths: iterable of paths accepted by ``ExperimentResults.load``.
    """
    residuals = []
    for path in results_paths:
        loaded = ExperimentResults.load(path)
        for _, config_res in loaded.res.items():
            metrics = config_res.metrics
            residuals.append(metrics[LoopMetric.BAC_from_predict.name] - metrics[LoopMetric.BAC.name])

    bac_residuals = torch.stack(residuals)
    # 1.0 wherever BAC_from_predict exceeds BAC, 0.0 elsewhere.
    bac_pred_better = (bac_residuals > 0).float()

    print(bac_residuals.max())
    print(bac_residuals.min())
    print(bac_pred_better.mean(), bac_residuals.mean(), bac_residuals.std())

In [19]:

# BAC_from_predict vs. BAC statistics over all result files.
compute_bac_pred_over_bac(all_results_paths)

tensor(0.1944)
tensor(-0.0044)
tensor(0.9800) tensor(0.0264) tensor(0.0414)


In [20]:
# Result files of the datasets treated as imbalanced in the comparisons below.
imbalanced_results_paths = [ './firefighters_final.bin',  './covertype_final.bin', './sod_final.bin']

In [21]:
# Same statistics, restricted to the imbalanced result files.
compute_bac_pred_over_bac(imbalanced_results_paths)

tensor(0.1944)
tensor(0.0013)
tensor(1.) tensor(0.0501) tensor(0.0480)


In [23]:
# Same statistics over the remaining result files (all paths minus the imbalanced ones).
compute_bac_pred_over_bac(set(all_results_paths) - set(imbalanced_results_paths))

tensor(0.0195)
tensor(-0.0044)
tensor(0.9600) tensor(0.0026) tensor(0.0021)


In [5]:
from al.loops.stats import friedman_difference_test, pairwise_wilcoxon_test, get_ranks_for_experiment
# Load every saved experiment, then call friedman_difference_test on the
# BAC_from_predict metric; its return value is displayed as the cell output
# (presumably the test's p-value — confirm against al.loops.stats).
all_results = [ExperimentResults.load(path) for path in all_results_paths]
friedman_difference_test(all_results, LoopMetric.BAC_from_predict.name)

1.6437276147240704e-21

In [6]:
# Names of the four class-weighted-entropy x PrDensity ensembles that are
# passed as `configs_to_test` to the pairwise test.
configs_to_test = [
    method.name
    for method in (
        proba_density_entropy_scott,
        proba_density_entropy_silverman,
        proba_density_entropy_silverman_gauss,
        proba_density_entropy_scott_gauss,
    )
]


In [8]:
# Pairwise Wilcoxon tests on BAC_from_predict for the configurations in
# `configs_to_test`, with alternative='greater' and self-comparisons excluded.
# NOTE(review): pvalue_adjustment_method='by' presumably selects the
# Benjamini-Yekutieli correction — confirm against pairwise_wilcoxon_test.
pairwise_test_results = pairwise_wilcoxon_test(all_results, LoopMetric.BAC_from_predict.name, pvalue_adjustment_method='by', alternative='greater', configs_to_test=configs_to_test,
                                               include_self_comparison=False)
# Keep the p-value matrix and the configuration names it is indexed by.
pairwise_p_values = pairwise_test_results.pvalues
pairwise_configs_names = pairwise_test_results.config_names

def bold(str_val):
    r"""Wrap ``str_val`` in a LaTeX ``\textbf{...}`` command and return the result."""
    return "".join((r"\textbf{", str_val, "}"))

def latex_tab_print(val):
    """Format a p-value for a LaTeX table cell.

    Returns "-" for NaN. Otherwise the value is rendered with three decimals;
    values not greater than 0.05 are additionally wrapped with ``bold``.
    """
    # NaN is the only value that compares unequal to itself.
    is_nan = val != val
    if is_nan:
        return "-"

    formatted = format(val, " 0.3f")
    return formatted if val > 0.05 else bold(formatted)

def latex_significance_tab_print(val):
    """Map a p-value to a significance marker for a LaTeX table cell.

    Returns:
        "-"          if ``val`` is NaN,
        " "          if ``val`` > 0.1,
        "*"          if 0.05 < ``val`` <= 0.1,
        "**"         if 0.01 < ``val`` <= 0.05,
        bold("***")  if ``val`` <= 0.01.
    """
    # NaN is the only value that compares unequal to itself.
    if val != val:
        return "-"

    if val > 0.1:
        return " "

    if val > 0.05:
        return "*"

    if val > 0.01:
        return "**"

    # Everything left is <= 0.01. The boundary value 0.01 itself is included
    # here so that it no longer matches no branch and returns None.
    return bold("***")

# Print the first four rows of the p-value matrix as LaTeX table rows:
# cells are significance markers joined by " & ", each row ends with "\\".
# NOTE(review): the 4 presumably equals len(configs_to_test) — confirm if that list changes.
for line in pairwise_p_values[:4]:
    print(*[latex_significance_tab_print(val) for val in line.tolist()], sep=" & ", end=" \\\\ \n")

- & - & - & - & \textbf{***} & \textbf{***} & \textbf{***} & * & \textbf{***} \\ 
- & - & - & - & \textbf{***} & \textbf{***} & \textbf{***} & \textbf{***} & \textbf{***} \\ 
- & - & - & - & \textbf{***} & \textbf{***} & \textbf{***} &   & \textbf{***} \\ 
- & - & - & - & \textbf{***} & \textbf{***} & \textbf{***} & * & \textbf{***} \\ 


In [10]:
# Display the configuration names returned with the pairwise test results.
pairwise_configs_names

['EnsembleClassWeightedEntropy1.0_ProbaDensityUniformKernelL2DistanceScottTactic1.0',
 'EnsembleClassWeightedEntropy1.0_ProbaDensityUniformKernelL2DistanceSilvermanTactic1.0',
 'EnsembleClassWeightedEntropy1.0_ProbaDensityGaussianKernelL2DistanceSilvermanTactic1.0',
 'EnsembleClassWeightedEntropy1.0_ProbaDensityGaussianKernelL2DistanceScottTactic1.0',
 'EnsembleEntropy1.0_KNearestNeighborRepr1.0',
 'BootstrapJS',
 'Random',
 'Entropy',
 'OffCenteredEntropy']

In [9]:
# Display the full p-value matrix of the pairwise test.
pairwise_p_values

tensor([[       nan,        nan,        nan,        nan, 1.8863e-05, 1.4200e-06,
         4.3902e-07, 8.1358e-02, 7.5151e-04],
        [       nan,        nan,        nan,        nan, 1.0290e-05, 4.3902e-07,
         4.3902e-07, 2.9882e-03, 2.6179e-05],
        [       nan,        nan,        nan,        nan, 4.5893e-05, 1.6234e-05,
         2.2700e-06, 2.6075e-01, 4.1061e-03],
        [       nan,        nan,        nan,        nan, 1.1773e-05, 8.4918e-07,
         9.0139e-07, 9.1848e-02, 2.0473e-03],
        [       nan,        nan,        nan,        nan,        nan,        nan,
                nan,        nan,        nan],
        [       nan,        nan,        nan,        nan,        nan,        nan,
                nan,        nan,        nan],
        [       nan,        nan,        nan,        nan,        nan,        nan,
                nan,        nan,        nan],
        [       nan,        nan,        nan,        nan,        nan,        nan,
                nan,        na

In [5]:
# Display the configuration names again (same expression as an earlier cell).
pairwise_configs_names

['EnsembleClassWeightedEntropy1.0_ProbaDensityUniformKernelL2DistanceScottTactic1.0',
 'EnsembleClassWeightedEntropy1.0_ProbaDensityUniformKernelL2DistanceSilvermanTactic1.0',
 'EnsembleClassWeightedEntropy1.0_ProbaDensityGaussianKernelL2DistanceSilvermanTactic1.0',
 'EnsembleClassWeightedEntropy1.0_ProbaDensityGaussianKernelL2DistanceScottTactic1.0',
 'EnsembleEntropy1.0_KNearestNeighborRepr1.0',
 'BootstrapJS',
 'Random',
 'Entropy',
 'OffCenteredEntropy']

In [22]:
# Ranks of every method on BAC_from_predict, printed per results file as a
# LaTeX table row.
# NOTE(review): int() truncates, so a fractional (tied) rank would lose its
# fractional part — confirm get_ranks_for_experiment yields integral ranks.
rank_rows = []
for result, dataset_name in zip(all_results, all_results_paths):
    ranking = [int(rank) for rank in get_ranks_for_experiment(result, LoopMetric.BAC_from_predict.name)]
    print(dataset_name)
    print(*ranking, sep=" & ", end=" \\\\ \n")
    rank_rows.append(ranking)

# Mean rank of each method across all result files, as one more table row.
all_rankings = torch.tensor(rank_rows, dtype=float).mean(axis=0).tolist()
print(*all_rankings, sep=" & ", end=" \\\\ \n")

./mnist_final.bin
3 & 1 & 4 & 2 & 9 & 7 & 8 & 5 & 6 \\ 
./firefighters_final.bin
1 & 2 & 3 & 4 & 8 & 6 & 9 & 5 & 7 \\ 
./covertype_final.bin
3 & 1 & 2 & 5 & 9 & 8 & 7 & 4 & 6 \\ 
./satimage_final.bin
2 & 3 & 4 & 1 & 8 & 6 & 9 & 5 & 7 \\ 
./letter_final.bin
3 & 6 & 4 & 5 & 1 & 9 & 8 & 2 & 7 \\ 
./sod_final.bin
4 & 2 & 9 & 1 & 7 & 3 & 6 & 8 & 5 \\ 
2.6666666666666665 & 2.5 & 4.333333333333333 & 3.0 & 7.0 & 6.5 & 7.833333333333333 & 4.833333333333333 & 6.333333333333333 \\ 
