In [1]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from nn.models import OneHiddenMLP
from nn.training import train_passive, TrainConfig
from nn.evaluation import evaluate_regression
from nn.experiments import ActiveConfig, run_active_regression
from nn.strategies import uncertainty_sampling, sensitivity_sampling, UncertaintySamplingConfig

from typing import Dict

SAVE_DIR = os.path.join('..', 'report', 'figures')
DATA_DIR = os.path.join('..', 'data')
os.makedirs(SAVE_DIR, exist_ok=True)

with open(os.path.join(DATA_DIR, 'reg_sensitivity_results.json'), 'r') as f:
    sen_results = json.load(f)
with open(os.path.join(DATA_DIR, 'passive_reg_best.json'), 'r') as f:
    pas_results = json.load(f)

DATASETS = ['diabetes', 'wine_quality', 'california']
METRICS = ['rmse', 'mae', 'r2']
UNCERTAINTY_METHODS = ['entropy', 'margin', 'least_confidence']
ACTIVE_PARAMS = {
    'diabetes': {'init': 20, 'query': 10, 'budget': 150},
    'wine_quality': {'init': 30, 'query': 15, 'budget': 300},
    'california': {'init': 50, 'query': 20, 'budget': 1000}
}

In [4]:
def get_data_splits_regression(dataset: str):
    # Load data
    if dataset == "diabetes":
        ds = datasets.load_diabetes()
        y = ds.target.astype(np.float32)
    elif dataset == "wine_quality":
        from sklearn.datasets import fetch_openml
        ds = fetch_openml('wine-quality-red', version=1, as_frame=False, parser='auto')
        X = ds.data.astype(np.float32)
        y = ds.target.astype(np.float32)
    elif dataset == "california":
        ds = datasets.fetch_california_housing()
        y = ds.target.astype(np.float32)
        X = ds.data.astype(np.float32)
    
    # Only set X here if it wasn't already set above
    if dataset != "wine_quality":
        X = ds.data.astype(np.float32)
    
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    return X_train_val, X_test, y_train_val, y_test

def evaluate_passive_test_regression(dataset: str) -> Dict[str, float]:
    # Get best hyperparameters from tuning results
    best_cfg = pas_results[dataset]['best_cfg']
    lr = best_cfg['lr']
    wd = best_cfg['wd']
    hidden = best_cfg['hidden']
    bs = best_cfg['bs']
    
    # Get data splits
    X_train_val, X_test, y_train_val, y_test = get_data_splits_regression(dataset)
    
    all_metrics = []

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Convert to tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_val, dtype=torch.float32).unsqueeze(-1)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(-1)
    
    # Create datasets
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    
    train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False)
    
    # Train model
    model = OneHiddenMLP(input_dim=X_train_scaled.shape[1], hidden_units=hidden, output_dim=1)
    loss_fn = nn.MSELoss()
    config = TrainConfig(learning_rate=lr, weight_decay=wd, batch_size=bs, max_epochs=200, patience=20)
    
    train_passive(model, train_loader, test_loader, loss_fn, config)
    
    # Evaluate on test set
    metrics = evaluate_regression(model, test_loader)
    
    return metrics

def evaluate_active_test_regression(dataset: str, strategy: str, method: str, budget: int) -> Dict[str, float]:
    dataset_params = ACTIVE_PARAMS[dataset]
    init = dataset_params['init']
    query = dataset_params['query']
    hidden, bs = 64, 64
    if strategy == 'uncertainty':
        lr = 0.0003
        wd = 0.0001
    else:
        best_cfg = sen_results[dataset]['best_cfg']['train_config']
        lr = best_cfg['learning_rate']
        wd = best_cfg['weight_decay']

    # Get data splits
    X_train_val, X_test, y_train_val, y_test = get_data_splits_regression(dataset)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Convert to tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_val, dtype=torch.float32).unsqueeze(-1)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(-1)
    
    # Create datasets
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False)
    
    # Simulate active learning on the train+val set
    train_config = TrainConfig(learning_rate=lr, weight_decay=wd, batch_size=bs, 
                                max_epochs=200, patience=20)
    
    # Create initial labeled pool
    num_train = X_train_scaled.shape[0]
    labeled_indices = torch.randperm(num_train)[:init]
    unlabeled_indices = torch.tensor([i for i in range(num_train) if i not in labeled_indices.tolist()], dtype=torch.long)
    
    x_pool = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_pool = y_train_tensor.clone()
    
    # Active learning loop
    while labeled_indices.numel() < min(budget, num_train):
        # Train model on current labeled set
        train_subset = TensorDataset(x_pool[labeled_indices], y_pool[labeled_indices])
        
        train_loader = DataLoader(train_subset, batch_size=bs, shuffle=True)
        
        model = OneHiddenMLP(input_dim=X_train_scaled.shape[1], hidden_units=hidden, output_dim=1)
        loss_fn = nn.MSELoss()
        
        train_passive(model, train_loader, test_loader, loss_fn, train_config)
        
        if unlabeled_indices.numel() == 0:
            break
        
        # Query selection
        if strategy == 'uncertainty':
            sel = uncertainty_sampling(
                model,
                x_pool[unlabeled_indices].to('cpu'),
                query,
                UncertaintySamplingConfig(mode="regression", method=method),
            )
        elif strategy == 'sensitivity':
            sel = sensitivity_sampling(model, x_pool[unlabeled_indices].to('cpu'), query)
        
        # Update labeled and unlabeled sets
        newly_selected = unlabeled_indices[sel]
        labeled_indices = torch.unique(torch.cat([labeled_indices, newly_selected]))
        mask = torch.ones_like(unlabeled_indices, dtype=torch.bool)
        mask[sel] = False
        unlabeled_indices = unlabeled_indices[mask]
        
        if labeled_indices.numel() >= budget:
            break
    
    # Final evaluation on test set
    final_train_subset = TensorDataset(x_pool[labeled_indices], y_pool[labeled_indices])
    final_train_loader = DataLoader(final_train_subset, batch_size=bs, shuffle=True)
    final_test_loader = DataLoader(test_dataset, batch_size=bs, shuffle=False)
    
    final_model = OneHiddenMLP(input_dim=X_train_scaled.shape[1], hidden_units=hidden, output_dim=1)
    loss_fn = nn.MSELoss()
    
    train_passive(final_model, final_train_loader, final_test_loader, loss_fn, train_config)
    
    # Evaluate on test set
    metrics = evaluate_regression(final_model, final_test_loader)
    
    return metrics


In [5]:
test_results = {
    'passive': {},
    'uncertainty': {},
    'sensitivity': {}
}

# Evaluate passive learning on test set
for dataset in DATASETS:
    print(f"Running passive on {dataset}")
    test_results['passive'][dataset] = evaluate_passive_test_regression(dataset)

# Evaluate uncertainty-based active learning on test set
for dataset in DATASETS:
    budget = ACTIVE_PARAMS[dataset]['budget']
    test_results['uncertainty'][dataset] = {}
    for method in UNCERTAINTY_METHODS:
        print(f"Running uncertainty on {dataset} - {method}")
        test_results['uncertainty'][dataset][method] = {}
        test_results['uncertainty'][dataset][method][str(budget)] = evaluate_active_test_regression(dataset, 'uncertainty', method, budget)

# Evaluate sensitivity-based active learning on test set
for dataset in DATASETS:
    budget = ACTIVE_PARAMS[dataset]['budget']
    print(f"Running sensitivity on {dataset}...")
    test_results['sensitivity'][dataset] = {}
    test_results['sensitivity'][dataset][str(budget)] = evaluate_active_test_regression(dataset, 'sensitivity', '', budget)

print("Done")

Running passive on diabetes
Running passive on wine_quality
Running passive on california
Running uncertainty on diabetes - entropy
Running uncertainty on diabetes - margin
Running uncertainty on diabetes - least_confidence
Running uncertainty on wine_quality - entropy
Running uncertainty on wine_quality - margin
Running uncertainty on wine_quality - least_confidence
Running uncertainty on california - entropy
Running uncertainty on california - margin
Running uncertainty on california - least_confidence
Running sensitivity on diabetes...
Running sensitivity on wine_quality...
Running sensitivity on california...
Done


In [7]:
summary_data = []

for dataset in DATASETS:

    budget = ACTIVE_PARAMS[dataset]['budget']
    # Passive learning
    summary_data.append({
        'dataset': dataset,
        'method': 'passive',
        'budget': budget,
        'rmse': test_results['passive'][dataset]['rmse'],
        'mae': test_results['passive'][dataset]['mae'],
        'r2': test_results['passive'][dataset]['r2'],
    })
    
    # Uncertainty methods
    for method in UNCERTAINTY_METHODS:
        max_budget = str(budget)
        summary_data.append({
            'dataset': dataset,
            'method': f'uncertainty_{method}',
            'budget': budget,
            'rmse': test_results['uncertainty'][dataset][method][max_budget]['rmse'],
            'mae': test_results['uncertainty'][dataset][method][max_budget]['mae'],
            'r2': test_results['uncertainty'][dataset][method][max_budget]['r2'],
        })
    
    # Sensitivity method
    max_budget = str(budget)
    summary_data.append({
        'dataset': dataset,
        'method': 'sensitivity',
        'budget': budget,
        'rmse': test_results['sensitivity'][dataset][max_budget]['rmse'],
        'mae': test_results['sensitivity'][dataset][max_budget]['mae'],
        'r2': test_results['sensitivity'][dataset][max_budget]['r2'],
    })

# Convert to DataFrame for nice display
df = pd.DataFrame(summary_data)
print(df.round(4))

# Save summary
df.to_csv(os.path.join(SAVE_DIR, 'reg_comparison_summary_test.csv'), index=False)

         dataset                        method  budget      rmse       mae  \
0       diabetes                       passive     150   51.3046   40.7117   
1       diabetes           uncertainty_entropy     150  151.3289  133.6237   
2       diabetes            uncertainty_margin     150  153.4836  134.9409   
3       diabetes  uncertainty_least_confidence     150  153.0798  135.4649   
4       diabetes                   sensitivity     150  153.3196  137.1308   
5   wine_quality                       passive     300    0.6021    0.4754   
6   wine_quality           uncertainty_entropy     300    1.7657    1.3952   
7   wine_quality            uncertainty_margin     300    2.1148    1.7154   
8   wine_quality  uncertainty_least_confidence     300    2.0495    1.6283   
9   wine_quality                   sensitivity     300    1.2974    1.0186   
10    california                       passive    1000    0.5372    0.3632   
11    california           uncertainty_entropy    1000    0.7750