In [None]:
%load_ext autoreload
%autoreload 2 
import numpy as np
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

from utils import get_model_values_df, DATASET_INFO, sample_data
from baselines import labeled_data_alone

pd.options.mode.chained_assignment = None

# --- Configuration ---
seed = 42      # shared by np.random.seed, the train/test split and sample_data
nl = 20        # number of labeled examples to sample
nu = 1000      # number of unlabeled examples to sample
sim = True     # NOTE(review): not referenced by any cell visible in this notebook
np.random.seed(seed)

dataset = 'CivilComments'
# Comma-separated subgroup column names (e.g. 'col_a,col_b'), or None for no subgroups.
subgroups = None
n_classes = DATASET_INFO[dataset]['n_classes']
model_names = DATASET_INFO[dataset]['model_names']

# Parse the subgroup spec into a list of column names. Whitespace around each
# name is stripped and empty entries are dropped, so 'a, b' yields ['a', 'b']
# and '' yields [] rather than [''] (which would later be used as a column name).
# None is preserved so downstream truthiness checks keep working.
subgroup_list = (
    [name.strip() for name in subgroups.split(',') if name.strip()]
    if subgroups is not None
    else None
)

# --- Load the DataFrame of model predictions and split it in half ---
dataset_df = get_model_values_df(dataset, model_names)

# A seeded 50% draw becomes the train half; every row whose index label was
# not drawn becomes the test half.
train_dataset_df = dataset_df.sample(frac=0.5, random_state=seed)
drawn_for_train = dataset_df.index.isin(train_dataset_df.index)
test_dataset_df = dataset_df[~drawn_for_train]

# --- Draw the labeled / unlabeled sample from the train half only ---
(
    sampled_data,
    sampled_labels,
    sampled_true_labels,
    sampled_data_df,
) = sample_data(train_dataset_df, nl, nu, model_names, seed, n_classes)

# Positions whose label differs from -1 are treated as labeled; positions
# whose label equals -1 are treated as unlabeled.
has_label = sampled_labels != -1
lacks_label = sampled_labels == -1
labeled_idxs = np.where(has_label)[0]
unlabeled_idxs = np.where(lacks_label)[0]

# Assign group memberships.
# Each list holds one membership array per grouping. The first entry is a
# catch-all 'global' grouping that places every example in the same group.
sampled_groups = [np.array(['global'] * (nl + nu))]
test_groups = [np.array(['global'] * len(test_dataset_df))]

if subgroup_list:
    for subgroup in subgroup_list:
        # Subgroup columns are looked up by name, so read them from the sampled
        # DataFrame. `sampled_data` is indexed with integer position arrays in a
        # later cell and appears to be a plain array, which cannot be indexed by
        # a column name.
        # NOTE(review): assumes sampled_data_df rows are in the same order as
        # sampled_data / sampled_labels -- confirm against sample_data.
        sampled_groups.append(np.array(sampled_data_df[subgroup].values))
        test_groups.append(np.array(test_dataset_df[subgroup].values))

# Sanity checks: the sample must contain exactly the requested number of
# labeled (nl) and unlabeled (nu) examples. On failure the message reports
# both the expected and the observed count.
assert len(labeled_idxs) == nl, (
    f"Expected nl={nl} labeled examples, found {len(labeled_idxs)}."
)
assert len(unlabeled_idxs) == nu, (
    f"Expected nu={nu} unlabeled examples, found {len(unlabeled_idxs)}."
)

print(f"Loaded {len(train_dataset_df)} train samples and {len(test_dataset_df)} test samples")
print(f"Sampled {nl} labeled and {nu} unlabeled examples")

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Every bundle below has the layout:
#   (model outputs, list of group-membership arrays, labels)

# Labeled portion of the sample.
labeled_group_slices = [membership[labeled_idxs] for membership in sampled_groups]
estimation_labeled_data = (
    sampled_data[labeled_idxs],
    labeled_group_slices,
    sampled_labels[labeled_idxs],
)

# Unlabeled portion of the sample, sliced the same way with unlabeled_idxs.
unlabeled_group_slices = [membership[unlabeled_idxs] for membership in sampled_groups]
estimation_unlabeled_data = (
    sampled_data[unlabeled_idxs],
    unlabeled_group_slices,
    sampled_labels[unlabeled_idxs],
)

# Held-out test half: the model-name columns as an array, the test group
# arrays as-is, and the 'label' column.
test_model_outputs = test_dataset_df[model_names].values
test_labels = test_dataset_df['label'].values
test_data = (test_model_outputs, test_groups, test_labels)

### Labeled data alone

In [None]:
# Estimate metrics from the labeled examples only (baseline).
method_config = dict(dataset=dataset, subgroups=subgroups)
metrics_df = labeled_data_alone(estimation_labeled_data, method_config)

# Translate each row's model index into the corresponding model name.
metrics_df['model'] = metrics_df['model_idx'].map(lambda idx: model_names[idx])

# Print the headline metrics as a plain-text table without the index column.
baseline_columns = ['model', 'acc', 'ece', 'auc', 'auprc']
print("\n=== Estimated Performance Metrics ===")
print(metrics_df[baseline_columns].to_string(index=False))

### SSME

In [None]:
from model import SSME_KDE 

# Estimate metrics with SSME_KDE, which is given both the labeled and the
# unlabeled bundles.
method_config = dict(dataset=dataset, subgroups=subgroups)
ssme_metrics_df = SSME_KDE(
    estimation_labeled_data,
    estimation_unlabeled_data,
    method_config,
)

# Translate each row's model index into the corresponding model name.
ssme_metrics_df['model'] = ssme_metrics_df['model_idx'].map(lambda idx: model_names[idx])

# Print the headline metrics as a plain-text table without the index column.
ssme_columns = ['model', 'ece', 'auc', 'auprc', 'acc']
print("\n=== Estimated Performance Metrics ===")
print(ssme_metrics_df[ssme_columns].to_string(index=False))

### Ground truth

In [None]:
# Reference metrics: run the labeled-data routine on the held-out test bundle.
# method_config is reused from the cell above.
gt_metrics_df = labeled_data_alone(test_data, method_config)

# Translate each row's model index into the corresponding model name.
gt_metrics_df['model'] = gt_metrics_df['model_idx'].map(lambda idx: model_names[idx])

# Print the headline metrics as a plain-text table without the index column.
gt_columns = ['model', 'ece', 'auc', 'auprc', 'acc']
print("\n=== Ground Truth Performance Metrics ===")
print(gt_metrics_df[gt_columns].to_string(index=False))