In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import sys
sys.path.append('../')

from dataloader import get_model_values_df, sample_data
from utils import DATASET_TO_N_CLASSES 

# --- Experiment configuration ------------------------------------------------
run = 46  # also reused below as the random seed for the split and for sample_data
np.random.seed(run)
n_labeled = 20
n_unlabeled = 1000
dataset = 'civilcomments'
model_alg_combinations="distilbert_CORAL,distilbert_ERM,distilbert_IRM,distilbert_ERM_seed1,distilbert_ERM_seed2,distilbert_IRM_seed1,distilbert_IRM_seed2"
model_alg_list = model_alg_combinations.split(',')

group_definition = 'global'
num_groups = 1
n_classes = DATASET_TO_N_CLASSES[dataset]

# Only useful when looking at a specific demographic, drawn from the metadata columns
subgroup = None
if group_definition == 'subgroups' and subgroup is None:
    # Without this guard the subgroup lookup below would index the frames with
    # None and fail with an opaque KeyError.
    raise ValueError("group_definition == 'subgroups' requires `subgroup` to name a metadata column")

# --- Load model outputs and split into train / held-out halves ---------------
split_name = 'labeled'
dataset_df = get_model_values_df(dataset, model_alg_list, value='prob_prediction')

train_dataset_df = dataset_df.sample(frac=.5, random_state=run)
test_dataset_df = dataset_df[~dataset_df.index.isin(train_dataset_df.index)]

# Sample data according to n_labeled and n_unlabeled
sampled_data, sampled_labels, sampled_true_labels, sampled_data_df = sample_data(train_dataset_df, n_labeled, n_unlabeled, model_alg_list, run, n_classes)
# Entries of sampled_labels equal to -1 are treated as unlabeled.
labeled_idxs = np.where(sampled_labels != -1)[0]
unlabeled_idxs = np.where(sampled_labels == -1)[0]

# Check the sample sizes before anything below relies on them.
assert len(labeled_idxs) == n_labeled
assert len(unlabeled_idxs) == n_unlabeled

# --- Group membership arrays (one array per group definition) ----------------
sampled_groups = [np.array(['global' for _ in range(n_labeled + n_unlabeled)])]
heldout_groups = [np.array(['global' for _ in range(len(test_dataset_df))])]

if group_definition == 'subgroups':
    # Read the subgroup column BEFORE sampled_data_df is reordered: these arrays
    # are later indexed with labeled_idxs / unlabeled_idxs, which refer to the
    # original sample order (assumes sample_data returns sampled_data_df
    # row-aligned with sampled_labels -- TODO confirm).
    sampled_groups.append(np.array(sampled_data_df[subgroup].values))
    heldout_groups.append(np.array(test_dataset_df[subgroup].values))

# Reorder sampled_data_df to match the order of estimation labeled and unlabeled data
sampled_data_df = pd.concat([sampled_data_df.iloc[labeled_idxs], sampled_data_df.iloc[unlabeled_idxs]], ignore_index=True)

# --- Held-out arrays used as the ground-truth reference ----------------------
heldout_data = test_dataset_df[model_alg_list].values
heldout_true_labels = test_dataset_df['label'].values


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# True labels for each partition of the training sample, kept apart from
# `sampled_labels` (where unlabeled points appear as -1).
train_labeled_data_labels = sampled_true_labels[labeled_idxs]
train_unlabeled_data_labels = sampled_true_labels[unlabeled_idxs]

# True labels in labeled-then-unlabeled order.
y_true = np.concatenate([train_labeled_data_labels, train_unlabeled_data_labels])

# Every split is packed as a (model outputs, list of group arrays, labels) triple.
train_labeled_data = (
    sampled_data[labeled_idxs],
    [group_values[labeled_idxs] for group_values in sampled_groups],
    sampled_labels[labeled_idxs],
)

train_unlabeled_data = (
    sampled_data[unlabeled_idxs],
    [group_values[unlabeled_idxs] for group_values in sampled_groups],
    sampled_labels[unlabeled_idxs],
)

test_data = (
    heldout_data,
    heldout_groups,
    heldout_true_labels,
)

# Mean true label over the unlabeled subset vs. over the whole training sample.
print("Prior: ", np.mean(train_unlabeled_data_labels), np.mean(sampled_true_labels))

Prior:  0.126 0.12549019607843137


In [7]:
from model import SSME_KDE_binary

# Settings forwarded to SSME_KDE_binary; what each key means is defined in
# model.py, not in this notebook.
method_config = {
    'dataset': dataset,
    'epochs': 20,
    'labeled_data_weight': 10,
    'simulate_predictions': True,
    'prior_type': 'learned',
    'use_sample_weights': True,
    'init': 'draw',
}

# Fit on the labeled + unlabeled training sample and show the estimated metrics.
class_samples, class_labels, metric_estimates = SSME_KDE_binary(
    train_labeled_data,
    train_unlabeled_data,
    method_config,
)
metric_estimates

  0%|          | 0/20 [00:00<?, ?it/s]

-575.010012167958


 55%|█████▌    | 11/20 [00:07<00:06,  1.44it/s]

-248.3697863333677


100%|██████████| 20/20 [00:13<00:00,  1.48it/s]


Estimated priors:  [0.865686274509804, 0.13431372549019607]


Unnamed: 0,auc,auprc,ece,demographic,acc,model_idx
0,0.925445,0.627788,0.08765,global,0.866667,0
1,0.954807,0.843256,0.037717,global,0.946078,1
2,0.953262,0.799798,0.064268,global,0.922549,2
3,0.957659,0.850183,0.039565,global,0.951961,3
4,0.953476,0.833346,0.027091,global,0.943137,4
5,0.956841,0.827024,0.049603,global,0.936275,5
6,0.954386,0.779653,0.058878,global,0.930392,6


#### Labeled data alone

In [5]:
from baselines import labeled_binary
# Baseline: metrics computed from the small labeled sample alone.
method_config = {'dataset': dataset, 
                'group_definition': group_definition,
                'model_alg_combinations': model_alg_combinations,}
# Stored under its own name: these are NOT ground-truth metrics, and the
# held-out cell assigns `gt_metrics_df`, so sharing that name would mislabel
# this table and discard it as soon as the next cell runs.
labeled_metrics_df = labeled_binary(train_labeled_data, method_config)
labeled_metrics_df

Unnamed: 0,auc,auprc,ece,demographic,acc,model_idx
0,0.916667,0.7,0.093668,global,0.9,0
1,0.972222,0.833333,0.04999,global,0.95,1
2,0.972222,0.833333,0.050792,global,0.95,2
3,0.972222,0.833333,0.052059,global,0.95,3
4,0.861111,0.642857,0.101043,global,0.9,4
5,0.916667,0.7,0.069793,global,0.95,5
6,0.944444,0.75,0.08407,global,0.9,6


#### Ground truth (large, held-out dataset)

In [6]:
from baselines import labeled_binary
# Reference metrics: the same labeled-data routine, applied to the held-out half.
method_config = {
    'dataset': dataset,
    'group_definition': group_definition,
    'model_alg_combinations': model_alg_combinations,
}
gt_metrics_df = labeled_binary(test_data, method_config)
gt_metrics_df

Unnamed: 0,auc,auprc,ece,demographic,acc,model_idx
0,0.861134,0.398339,0.06057,global,0.881808,0
1,0.937094,0.723234,0.061885,global,0.921514,1
2,0.917046,0.662273,0.105523,global,0.881703,2
3,0.936794,0.725662,0.062255,global,0.921574,3
4,0.940185,0.734361,0.049631,global,0.921051,4
5,0.912483,0.668148,0.097461,global,0.890792,5
6,0.918956,0.657508,0.101787,global,0.887728,6
