# Unlocking the Power of Active Learning: A Hands-on Exploration

Fabian Kovac [\<fabian.kovac@fhstp.ac.at\>](mailto:fabian.kovac@fhstp.ac.at)

Oliver Eigner [\<oliver.eigner@fhstp.ac.at\>](mailto:oliver.eigner@fhstp.ac.at)

---
## Imports

In [None]:
import random
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets._samples_generator import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from learner import ActiveLearner

---
## Settings

In [None]:
# set SEED
# one seed shared by Python's `random`, NumPy's global RNG and every
# `random_state = SEED` argument used further down in this notebook
SEED = 42

# seed both global generators once, before any data or model is created
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# number of samples to query at each iteration
# (handed to ActiveLearner as `n_samples`)
N_SAMPLES = 20

# number of active learning iterations
# (handed to ActiveLearner as `n_queries`)
# NOTE(review): N_SAMPLES * N_QUERIES = 2000, which is more than the 800 training
# samples left after the 1000-sample / 20 % test split in the Data section —
# confirm how ActiveLearner behaves once its pool is exhausted
N_QUERIES = 100

# percentage of human labeling error
# (handed to ActiveLearner as `human_error`)
# NOTE(review): 0.05 is presumably a fraction (i.e. 5 %), not a percent value —
# confirm against learner.py
HUMAN_ERROR = 0.05

# model to test (support vector machine with radial basis function kernel)
# MODEL is only a template: every use below goes through deepcopy(MODEL).
# probability = True makes the SVC expose class probabilities; presumably the
# confidence/entropy based strategies rely on that — TODO confirm in learner.py
MODEL = SVC(kernel = 'rbf', C = 13, gamma = 0.8, probability = True, random_state = SEED)

In [None]:
# matplotlib styles
# IPython magic (not plain Python): render figures inline in the notebook output
%matplotlib inline
# global look: ggplot style sheet and a 7x4 default figure size
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (7, 4)
# default colormap; it is what the scatter plots below use when points are
# colored by class label through `c = ...`
plt.rcParams['image.cmap'] = 'Dark2'

---
## Helper Functions

In [None]:
# function for plotting svc decision boundaries
def plot_svc_decision_function(model, ax = None, plot_support = False):
    """Overlay a fitted model's decision boundary on a 2D axes.

    The decision function is evaluated on a regular grid spanning the current
    view limits of the axes (NumPy's default `linspace` resolution, i.e. 50
    points per axis) and its zero level is drawn as a solid crimson contour.

    Parameters
    ----------
    model : object exposing `decision_function`; `support_vectors_` is read
        as well when `plot_support` is true.
    ax : axes to draw on; the current pyplot axes are used when None.
    plot_support : when true, additionally highlight the support vectors and
        draw the -1 / +1 levels of the decision function as dashed contours.
    """
    axes = plt.gca() if ax is None else ax

    # remember the view limits on entry; they define the grid and are restored at the end
    x_bounds = axes.get_xlim()
    y_bounds = axes.get_ylim()

    # regular grid over the visible area, flattened to one (x, y) row per grid node
    grid_x, grid_y = np.meshgrid(
        np.linspace(x_bounds[0], x_bounds[1]),
        np.linspace(y_bounds[0], y_bounds[1]),
    )
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    scores = model.decision_function(grid_points).reshape(grid_x.shape)

    # the boundary itself: where the decision function crosses zero
    axes.contour(grid_x, grid_y, scores, colors = ['crimson'], levels = [0], alpha = 1, linestyles = ['-'])

    if plot_support:
        support = model.support_vectors_
        axes.scatter(support[:, 0], support[:, 1], s = 300, lw = 1, facecolors = 'crimson', alpha = 0.2)
        axes.contour(grid_x, grid_y, scores, colors = ['crimson', 'crimson'], levels = [-1, 1], alpha = 0.3, linestyles = ['--', '--'])

    # re-apply the limits captured on entry so the added artists leave the view unchanged
    axes.set_xlim(x_bounds)
    axes.set_ylim(y_bounds)

In [None]:
# function for plotting active learning history
def plot_history(history, X_train, X_test, y_test, centralized_test_acc, show_models = False):
    """Plot an active learning run and compare it with the centralized model.

    Parameters
    ----------
    history : dict
        Maps an iteration key to a state dict. From each state dict the keys
        'model' (scored with `.score`), 'X' and 'y' (the data the model of
        that iteration is scored and plotted on) are read.
    X_train : 2D array, drawn in light gray as background of the training panel
        (only its first two columns are used).
    X_test, y_test : held-out data every recorded model is scored on.
    centralized_test_acc : accuracy drawn as a dashed horizontal reference line.
    show_models : if True and the recorded models are SVC instances, one
        two-panel figure (training iteration / test data) is shown per entry.

    The function returns nothing; it only produces matplotlib figures.
    """
    # peek at the first recorded state without assuming that a key `0` exists
    # or that the history is non-empty
    first_state = next(iter(history.values()), None)

    # plot each model state in the active learning history
    # plotting decision boundaries is only implemented for SVC models for now
    if show_models and first_state is not None and isinstance(first_state['model'], SVC):
        for i, ret in history.items():
            model = ret['model']
            X_i = ret['X']
            y_i = ret['y']

            # model metrics
            train_acc = round(model.score(X_i, y_i), 3)
            test_acc = round(model.score(X_test, y_test), 3)

            # generate two subplots, one for the training data (active learning iteration) and one for the test data
            fig, ax = plt.subplots(1, 2, figsize = (14, 4))
            fig.suptitle('Active Learning Model', fontsize = 16, fontweight = 'bold')
            plt.subplots_adjust(top = 0.8)

            # left panel: whole training set in gray, the samples of this iteration colored by label
            ax[0].title.set_text(f'Training Iteration {i} | Acc: {train_acc}')
            ax[0].scatter(X_train[:, 0], X_train[:, 1], c = 'lightgray', alpha = 1, s = 40)
            ax[0].scatter(X_i[:, 0], X_i[:, 1], c = y_i, s = 40)
            plot_svc_decision_function(model, ax[0])

            # right panel: test set with the same decision boundary
            ax[1].title.set_text(f'Test data | Acc: {test_acc}')
            ax[1].scatter(X_test[:, 0], X_test[:, 1], c = y_test, s = 40)
            plot_svc_decision_function(model, ax[1])

            plt.show()

    # linegraph of model accuracy over iterations compared to centralized model
    plt.axhline(y = centralized_test_acc, color = 'gray', linestyle = '--', label = 'Centralized Test Accuracy')

    # one test accuracy per recorded state, in the iteration order of `history`;
    # the x axis is the position in that order, not the history key
    test_accs = [ret['model'].score(X_test, y_test) for ret in history.values()]
    plt.plot(test_accs, label = 'Active Learning Test Accuracy')

    plt.xlabel('Iteration')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

---
## Data

In [None]:
# generate data
# synthetic binary classification problem: 1000 points with two features, both
# informative, two clusters per class; the seed makes the data set reproducible
X, y = make_classification(
    n_samples = 1000, n_features = 2, n_classes = 2,
    n_informative = 2, n_redundant = 0, n_clusters_per_class = 2, class_sep = 0.85,
    random_state = SEED)

# train-test split
# 80 % train / 20 % test, stratified on y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = SEED)

# plot data
# training points colored by class label; the trailing semicolon suppresses
# the notebook's textual output for the last expression
plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train, s = 40);

---
## Centralized Model

In [None]:
# train a centralized model
# baseline: a copy of the MODEL template fitted on the complete labeled training set
centralized_model = deepcopy(MODEL)
centralized_model.fit(X_train, y_train)

# model metrics
# `test_acc` is reused by the active learning cells below as the reference
# accuracy handed to plot_history
train_acc = round(centralized_model.score(X_train, y_train), 3)
test_acc = round(centralized_model.score(X_test, y_test), 3)

# plot decision boundaries
# two panels side by side: training data (left) and test data (right)
fig, ax = plt.subplots(1, 2, figsize = (14, 4))
fig.suptitle(f'Centralized model', fontsize = 16, fontweight = 'bold')
plt.subplots_adjust(top = 0.8)

# the scatter is drawn first so that plot_svc_decision_function picks up the
# axis limits of the data when it builds its evaluation grid
ax[0].title.set_text(f'Training data | Acc: {train_acc}')
ax[0].scatter(X_train[:, 0], X_train[:, 1], c = y_train, s = 40)
plot_svc_decision_function(centralized_model, ax[0])

ax[1].title.set_text(f'Test data | Acc: {test_acc}')
ax[1].scatter(X_test[:, 0], X_test[:, 1], c = y_test, s = 40)
plot_svc_decision_function(centralized_model, ax[1])

---
## Active Learning

### Random Sampling

In [None]:
# Reset seed to ensure same initial model
# NOTE(review): only NumPy's global RNG is re-seeded here; Python's `random`
# module is seeded once in the Settings section and not reset — confirm that
# ActiveLearner draws exclusively from NumPy
np.random.seed(SEED)

# Active Learning using Random Sampling
# the learner receives the training split and its own deep copy of the MODEL template
learner = ActiveLearner(
    X = X_train,
    y = y_train,
    model = deepcopy(MODEL),
    n_samples = N_SAMPLES,
    n_queries = N_QUERIES,
    strategy = 'random',
    human_error = HUMAN_ERROR
)

# fit() returns the history that plot_history iterates over
# (each value is read for a 'model' entry)
history = learner.fit()

# `test_acc` is the centralized model's test accuracy computed above;
# show_models = False skips the per-iteration decision boundary figures
plot_history(history, X_train, X_test, y_test, test_acc, show_models = False)

### Least Confidence Sampling

In [None]:
# Reset seed to ensure same initial model
# NOTE(review): only NumPy's global RNG is re-seeded here; Python's `random`
# module is seeded once in the Settings section and not reset — confirm that
# ActiveLearner draws exclusively from NumPy
np.random.seed(SEED)

# Active Learning using least confidence sampling
# the learner receives the training split and its own deep copy of the MODEL template;
# how 'least_conf' selects samples is defined in learner.py (not visible here)
learner = ActiveLearner(
    X = X_train,
    y = y_train,
    model = deepcopy(MODEL),
    n_samples = N_SAMPLES,
    n_queries = N_QUERIES,
    strategy = 'least_conf',
    human_error = HUMAN_ERROR
)

# fit() returns the history that plot_history iterates over
# (each value is read for a 'model' entry)
history = learner.fit()

# `test_acc` is the centralized model's test accuracy computed above;
# show_models = False skips the per-iteration decision boundary figures
plot_history(history, X_train, X_test, y_test, test_acc, show_models = False)

### Margin Confidence Sampling

In [None]:
# Reset seed to ensure same initial model
# NOTE(review): only NumPy's global RNG is re-seeded here; Python's `random`
# module is seeded once in the Settings section and not reset — confirm that
# ActiveLearner draws exclusively from NumPy
np.random.seed(SEED)

# Active Learning using margin confidence sampling
# the learner receives the training split and its own deep copy of the MODEL template;
# how 'margin_conf' selects samples is defined in learner.py (not visible here)
learner = ActiveLearner(
    X = X_train,
    y = y_train,
    model = deepcopy(MODEL),
    n_samples = N_SAMPLES,
    n_queries = N_QUERIES,
    strategy = 'margin_conf',
    human_error = HUMAN_ERROR
)

# fit() returns the history that plot_history iterates over
# (each value is read for a 'model' entry)
history = learner.fit()

# `test_acc` is the centralized model's test accuracy computed above;
# show_models = False skips the per-iteration decision boundary figures
plot_history(history, X_train, X_test, y_test, test_acc, show_models = False)

### Ratio Confidence Sampling

In [None]:
# Reset seed to ensure same initial model
# NOTE(review): only NumPy's global RNG is re-seeded here; Python's `random`
# module is seeded once in the Settings section and not reset — confirm that
# ActiveLearner draws exclusively from NumPy
np.random.seed(SEED)

# Active Learning using ratio confidence sampling
# the learner receives the training split and its own deep copy of the MODEL template;
# how 'ratio_conf' selects samples is defined in learner.py (not visible here)
learner = ActiveLearner(
    X = X_train,
    y = y_train,
    model = deepcopy(MODEL),
    n_samples = N_SAMPLES,
    n_queries = N_QUERIES,
    strategy = 'ratio_conf',
    human_error = HUMAN_ERROR
)

# fit() returns the history that plot_history iterates over
# (each value is read for a 'model' entry)
history = learner.fit()

# `test_acc` is the centralized model's test accuracy computed above;
# show_models = False skips the per-iteration decision boundary figures
plot_history(history, X_train, X_test, y_test, test_acc, show_models = False)

### Entropy Sampling

In [None]:
# Reset seed to ensure same initial model
# NOTE(review): only NumPy's global RNG is re-seeded here; Python's `random`
# module is seeded once in the Settings section and not reset — confirm that
# ActiveLearner draws exclusively from NumPy
np.random.seed(SEED)

# Active Learning using entropy sampling
# the learner receives the training split and its own deep copy of the MODEL template;
# how 'entropy' selects samples is defined in learner.py (not visible here)
learner = ActiveLearner(
    X = X_train,
    y = y_train,
    model = deepcopy(MODEL),
    n_samples = N_SAMPLES,
    n_queries = N_QUERIES,
    strategy = 'entropy',
    human_error = HUMAN_ERROR
)

# fit() returns the history that plot_history iterates over
# (each value is read for a 'model' entry)
history = learner.fit()

# `test_acc` is the centralized model's test accuracy computed above;
# show_models = False skips the per-iteration decision boundary figures
plot_history(history, X_train, X_test, y_test, test_acc, show_models = False)