In [1]:
# Compare Active Learning Strategies - Classification (Test Set Evaluation)
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Import our modules
from alnn.models import OneHiddenMLP
from alnn.training import train_passive, TrainConfig
from alnn.evaluation import evaluate_classification
from alnn.experiments import ActiveConfig, run_active_classification
from alnn.strategies import uncertainty_sampling, sensitivity_sampling, UncertaintySamplingConfig
from typing import Dict

SAVE_DIR = os.path.join('..', 'report', 'figures')
DATA_DIR = os.path.join('..', 'data')
os.makedirs(SAVE_DIR, exist_ok=True)

# Load hyperparameter tuning results
try:
    with open(os.path.join(DATA_DIR, 'cls_uncertainty_results.json'), 'r') as f:
        unc_results = json.load(f)
    with open(os.path.join(DATA_DIR, 'cls_sensitivity_results.json'), 'r') as f:
        sen_results = json.load(f)
    with open(os.path.join(DATA_DIR, 'passive_cls_best.json'), 'r') as f:
        pas_results = json.load(f)
except FileNotFoundError as e:
    print(f"Results file not found: {e}")
    print("Please run the combined_run_cls.py script first to generate hyperparameter tuning results")
    exit()

DATASETS = ['iris', 'wine', 'breast_cancer']
METRICS = ['accuracy', 'f1_macro']
BUDGETS = [40, 80, 120, 160, 200]
UNCERTAINTY_METHODS = ['entropy', 'margin', 'least_confidence']
N_TRIALS = 5  # Number of random seeds for test evaluation


In [2]:
# Test Set Evaluation Functions
def get_data_splits(dataset: str):
    """Get train/validation/test splits matching the combined run scripts."""
    # Load data
    if dataset == "iris":
        ds = datasets.load_iris()
    elif dataset == "wine":
        ds = datasets.load_wine()
    elif dataset == "breast_cancer":
        ds = datasets.load_breast_cancer()
    
    X, y = ds.data, ds.target
    
    # Split into train+val (80%) and test (20%) - matching combined_run_cls.py
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    return X_train_val, X_test, y_train_val, y_test

def evaluate_passive_test(dataset: str, budget: int) -> Dict[str, float]:
    """Evaluate passive learning on test set using best hyperparameters."""
    # Get best hyperparameters from tuning results
    best_cfg = pas_results[dataset]['best_cfg']
    lr = best_cfg['lr']
    wd = best_cfg['wd']
    hidden = best_cfg['hidden']
    bs = best_cfg['bs']
    
    # Get data splits
    X_train_val, X_test, y_train_val, y_test = get_data_splits(dataset)
    
    all_metrics = []
    
    # Use all train+val data for training (matching the budget concept)
    # For passive learning, we use all available training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Convert to tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_val, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)
    
    # Create datasets
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    
    train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False)
    
    # Train model
    model = OneHiddenMLP(input_dim=X_train_scaled.shape[1], hidden_units=hidden, output_dim=len(np.unique(y_train_val)))
    loss_fn = nn.CrossEntropyLoss()
    config = TrainConfig(learning_rate=lr, weight_decay=wd, batch_size=bs, max_epochs=200, patience=20, device='cpu')
    
    train_passive(model, train_loader, test_loader, loss_fn, config)
    
    # Evaluate on test set
    metrics = evaluate_classification(model, test_loader, device='cpu')
    
    return metrics

def evaluate_active_test(dataset: str, strategy: str, method: str, budget: int) -> Dict[str, float]:
    """Evaluate active learning on test set using best hyperparameters."""
    best_cfg = pas_results[dataset]['best_cfg']
    lr = best_cfg['lr']
    wd = best_cfg['wd']
    hidden = best_cfg['hidden']
    bs = best_cfg['bs']
    method = ''
    init, query = 20, 10

    # Get data splits
    X_train_val, X_test, y_train_val, y_test = get_data_splits(dataset)
    
    all_metrics = []
    
        
    # Standardize features using train+val data only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Convert to tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_val, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)

    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False)
    
    # Simulate active learning on the train+val set
    train_config = TrainConfig(learning_rate=lr, weight_decay=wd, batch_size=bs, 
                                max_epochs=200, patience=20, device='cpu')
    
    # Create initial labeled pool
    num_train = X_train_scaled.shape[0]
    labeled_indices = torch.randperm(num_train)[:init]
    unlabeled_indices = torch.tensor([i for i in range(num_train) if i not in labeled_indices.tolist()], dtype=torch.long)
    
    x_pool = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_pool = y_train_tensor.clone()
    
    # Active learning loop
    while labeled_indices.numel() < min(budget, num_train):
        # Train model on current labeled set
        train_subset = TensorDataset(x_pool[labeled_indices], y_pool[labeled_indices])
        
        train_loader = DataLoader(train_subset, batch_size=bs, shuffle=True)
        
        model = OneHiddenMLP(input_dim=X_train_scaled.shape[1], hidden_units=hidden, output_dim=len(np.unique(y_train_val)))
        loss_fn = nn.CrossEntropyLoss()
        
        train_passive(model, train_loader, test_loader, loss_fn, train_config)
        
        if unlabeled_indices.numel() == 0:
            break
        
        # Query selection
        if strategy == 'uncertainty':
            sel = uncertainty_sampling(
                model,
                x_pool[unlabeled_indices].to(train_config.device),
                query,
                UncertaintySamplingConfig(mode="classification", method=method),
            )
        elif strategy == 'sensitivity':
            sel = sensitivity_sampling(model, x_pool[unlabeled_indices].to(train_config.device), query)
        
        # Update labeled and unlabeled sets
        newly_selected = unlabeled_indices[sel]
        labeled_indices = torch.unique(torch.cat([labeled_indices, newly_selected]))
        mask = torch.ones_like(unlabeled_indices, dtype=torch.bool)
        mask[sel] = False
        unlabeled_indices = unlabeled_indices[mask]
        
        if labeled_indices.numel() >= budget:
            break
    
    # Final evaluation on test set
    final_train_subset = TensorDataset(x_pool[labeled_indices], y_pool[labeled_indices])
    final_train_loader = DataLoader(final_train_subset, batch_size=bs, shuffle=True)
    final_test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False)
    
    final_model = OneHiddenMLP(input_dim=X_train_scaled.shape[1], hidden_units=hidden, output_dim=len(np.unique(y_train_val)))
    loss_fn = nn.CrossEntropyLoss()
    
    train_passive(final_model, final_train_loader, final_test_loader, loss_fn, train_config)
    
    # Evaluate on test set
    metrics = evaluate_classification(final_model, final_test_loader, device='cpu')
    
    return metrics

print("Test set evaluation functions defined!")


Test set evaluation functions defined!


In [3]:
# Perform Test Set Evaluation
print("Starting test set evaluation...")
print("This will evaluate all methods on the held-out test sets that were never seen during hyperparameter tuning.")

# Store test results
test_results = {
    'passive': {},
    'uncertainty': {},
    'sensitivity': {}
}

# Evaluate passive learning on test set
print("\nEvaluating passive learning on test sets...")
for dataset in DATASETS:
    print(f"  {dataset}...")
    test_results['passive'][dataset] = evaluate_passive_test(dataset, max(BUDGETS))

# Evaluate uncertainty-based active learning on test set
print("\nEvaluating uncertainty-based active learning on test sets...")
for dataset in DATASETS:
    test_results['uncertainty'][dataset] = {}
    for method in UNCERTAINTY_METHODS:
        print(f"  {dataset} - {method}...")
        test_results['uncertainty'][dataset][method] = {}
        for budget in BUDGETS:
            test_results['uncertainty'][dataset][method][str(budget)] = evaluate_active_test(dataset, 'uncertainty', method, budget)

# Evaluate sensitivity-based active learning on test set
print("\nEvaluating sensitivity-based active learning on test sets...")
for dataset in DATASETS:
    print(f"  {dataset}...")
    test_results['sensitivity'][dataset] = {}
    for budget in BUDGETS:
        test_results['sensitivity'][dataset][str(budget)] = evaluate_active_test(dataset, 'sensitivity', '', budget)

print("\nTest set evaluation completed!")
print("Results stored in test_results dictionary.")


Starting test set evaluation...
This will evaluate all methods on the held-out test sets that were never seen during hyperparameter tuning.

Evaluating passive learning on test sets...
  iris...
  wine...
  breast_cancer...

Evaluating uncertainty-based active learning on test sets...
  iris - entropy...
  iris - margin...
  iris - least_confidence...
  wine - entropy...
  wine - margin...
  wine - least_confidence...
  breast_cancer - entropy...
  breast_cancer - margin...
  breast_cancer - least_confidence...

Evaluating sensitivity-based active learning on test sets...
  iris...
  wine...
  breast_cancer...

Test set evaluation completed!
Results stored in test_results dictionary.


In [5]:
# Compare all datasets using TEST SET results
for dataset in DATASETS:
    print(f"\n=== Comparing strategies for {dataset} (TEST SET) ===")
    
    for metric in METRICS:
        plt.figure(figsize=(10, 6))
        
        # Plot uncertainty methods
        for method in UNCERTAINTY_METHODS:
            results = [test_results['uncertainty'][dataset][method][str(b)][f'{metric}'] for b in BUDGETS]
            plt.plot(BUDGETS, results, marker='o', label=f'uncertainty_{method}', linewidth=2)
        
        # Plot sensitivity method
        results = [test_results['sensitivity'][dataset][str(b)][f'{metric}'] for b in BUDGETS]
        plt.plot(BUDGETS, results, marker='s', label='sensitivity', linewidth=2)
        
        # Plot passive baseline as horizontal line
        baseline = test_results['passive'][dataset][f'{metric}']
        plt.axhline(baseline, color='k', linestyle='--', label='passive_best', linewidth=2)
        
        plt.xlabel('Labeled budget (max_labels)')
        plt.ylabel(metric)
        plt.title(f'{dataset}: Strategies Comparison ({metric}) - TEST SET EVALUATION')
        plt.grid(True, alpha=0.3)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(SAVE_DIR, f'cls_{dataset}_comparison_{metric}_test.png'), dpi=200, bbox_inches='tight')
        plt.close()
        
        print(f"Saved {dataset} {metric} comparison (TEST SET)")



=== Comparing strategies for iris (TEST SET) ===
Saved iris accuracy comparison (TEST SET)
Saved iris f1_macro comparison (TEST SET)

=== Comparing strategies for wine (TEST SET) ===
Saved wine accuracy comparison (TEST SET)
Saved wine f1_macro comparison (TEST SET)

=== Comparing strategies for breast_cancer (TEST SET) ===
Saved breast_cancer accuracy comparison (TEST SET)
Saved breast_cancer f1_macro comparison (TEST SET)


In [6]:
# Create summary table using TEST SET results
print("\n=== Summary Table (TEST SET EVALUATION) ===")
summary_data = []

for dataset in DATASETS:
    # Passive learning
    summary_data.append({
        'dataset': dataset,
        'method': 'passive',
        'budget': max(BUDGETS),
        'accuracy': test_results['passive'][dataset]['accuracy'],
        'f1': test_results['passive'][dataset]['f1_macro'],
    })
    
    # Uncertainty methods
    for method in UNCERTAINTY_METHODS:
        max_budget = str(max(BUDGETS))
        summary_data.append({
            'dataset': dataset,
            'method': f'uncertainty_{method}',
            'budget': max(BUDGETS),
            'accuracy': test_results['uncertainty'][dataset][method][max_budget]['accuracy'],
            'f1': test_results['uncertainty'][dataset][method][max_budget]['f1_macro'],
        })
    
    # Sensitivity method
    max_budget = str(max(BUDGETS))
    summary_data.append({
        'dataset': dataset,
        'method': 'sensitivity',
        'budget': max(BUDGETS),
        'accuracy': test_results['sensitivity'][dataset][max_budget]['accuracy'],
        'f1': test_results['sensitivity'][dataset][max_budget]['f1_macro'],
    })

# Convert to DataFrame for nice display
df = pd.DataFrame(summary_data)
print(df.round(4))

# Save summary
df.to_csv(os.path.join(SAVE_DIR, 'cls_comparison_summary_test.csv'), index=False)
print(f'\nSaved TEST SET comparison summary to {SAVE_DIR}')
print('All TEST SET comparison figures and summary saved!')



=== Summary Table (TEST SET EVALUATION) ===
          dataset                        method  budget  accuracy      f1
0            iris                       passive     200    0.9667  0.9666
1            iris           uncertainty_entropy     200    0.9333  0.9333
2            iris            uncertainty_margin     200    0.9667  0.9666
3            iris  uncertainty_least_confidence     200    0.9667  0.9666
4            iris                   sensitivity     200    0.9667  0.9666
5            wine                       passive     200    0.9722  0.9710
6            wine           uncertainty_entropy     200    1.0000  1.0000
7            wine            uncertainty_margin     200    0.9722  0.9710
8            wine  uncertainty_least_confidence     200    1.0000  1.0000
9            wine                   sensitivity     200    0.9722  0.9710
10  breast_cancer                       passive     200    0.9649  0.9627
11  breast_cancer           uncertainty_entropy     200    0.9649  