In [20]:
from datasets import load_dataset, load_metric
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed
import numpy as np
import torch
from torch.utils.data import DataLoader
import pickle
from tqdm import tqdm

In [4]:
def calculate_entropy(self, prob_dist):
    """
    Return the Shannon entropy, in bits, of a probability distribution.

    Keyword arguments:
        self -- unused; kept in the signature so existing calls that pass a
                placeholder first argument keep working.
        prob_dist -- a pytorch tensor of real numbers between 0 and 1 that total to 1.0. e.g. tensor([0.0321, 0.6439, 0.0871, 0.2369])

    Returns:
        A 0-dim tensor holding -sum(p * log2(p)) over every element of
        prob_dist. Entries that are exactly 0 contribute 0 (the usual
        0 * log(0) = 0 convention) instead of turning the result into NaN.
    """
    # log2(0) is -inf and 0 * -inf is NaN, which would poison the whole sum for
    # any distribution containing an exact zero (e.g. a one-hot prediction).
    # Replace zeros by 1 inside the log only: log2(1) == 0, so those terms vanish.
    safe_probs = torch.where(prob_dist == 0, torch.ones_like(prob_dist), prob_dist)
    log_probs = prob_dist * torch.log2(safe_probs)  # multiply each probability by its base 2 log
    raw_entropy = 0 - torch.sum(log_probs)
    return raw_entropy

In [14]:
# Module-level tokenizer used by preprocess() below.
# NOTE(review): the checkpoint name is hard-coded here and repeated as the default
# of Config.model_name; the two must be kept in sync by hand.
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")

def preprocess(data):
    """
    Turn a raw dataset split into tokenized, torch-formatted model inputs.

    The original 'label' column is renamed to 'scalar_label' and a new binary
    'label' column is derived from it (0 when the scalar is below 0.5, else 1).
    The "sentence" column is then tokenized with the module-level `tokenizer`,
    padded/truncated to `config.max_length`, and the binary label is stored
    under "labels". All pre-existing columns are dropped from the result.

    Keyword arguments:
        data -- a dataset exposing rename_column, map and column_names, with
                'label' and 'sentence' columns.

    Returns:
        The tokenized dataset with its format set to "torch".
    """

    def to_binary_label(example):
        # Threshold the scalar label at 0.5.
        if example['scalar_label'] < 0.5:
            return {'label': 0}
        return {'label': 1}

    def encode_batch(batch):
        # Batched tokenization; every sequence ends up exactly max_length long.
        encoded = tokenizer(
            batch["sentence"],
            padding="max_length",
            max_length=config.max_length,
            truncation=True,
        )
        encoded["labels"] = batch["label"]
        return encoded

    labelled = data.rename_column('label', 'scalar_label').map(to_binary_label)

    # Drop every input column so only the tokenizer outputs and "labels" remain.
    encoded_ds = labelled.map(
        encode_batch,
        batched=True,
        remove_columns=labelled.column_names,
    )
    encoded_ds.set_format(type="torch")
    return encoded_ds

In [11]:
@dataclass(frozen=True)
class Config:
    """Immutable bundle of experiment settings.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """
    # Passed to the tokenizer as max_length for padding and truncation.
    max_length: int = 66
    # NOTE(review): not read in the visible cells; presumably toggles a debug run. Confirm.
    debug: bool = False
    # Checkpoint name given to AutoModelForSequenceClassification.from_pretrained.
    model_name: str = "google/electra-small-discriminator"
    # NOTE(review): not read in the visible cells; presumably names the sampling strategy. Confirm.
    strategy: str = "random_sampling"
    # NOTE(review): not read in the visible cells; presumably the number of examples to sample. Confirm.
    sampling_size: int = 1000
    # NOTE(review): not read in the visible cells; presumably a training step budget. Confirm.
    max_steps: int = 10000
    # Batch size used when building the DataLoader.
    batch_size: int = 8


# Settings for this run. Fields not listed here (model_name, batch_size) keep
# their Config defaults.
sampling_size = 1000
config = Config(
    max_length=66,
    debug=False,
    strategy="uncertainty_sampling",
    sampling_size=sampling_size,
    max_steps=10000,
)

In [16]:
############ set up data ############
# Seed first, before the model is constructed.
# NOTE(review): presumably this makes the newly initialised classification head reproducible. Confirm.
set_seed(42)
# NOTE(review): the variable is called `sst2`, but the dataset requested is "sst"
# (loaded with its default config, per the log output below).
sst2 = load_dataset("sst")
model = AutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=2)
# preprocess() binarises the labels and tokenizes the "sentence" column.
original_train_ds = preprocess(sst2["train"])
# shuffle=False: batches follow dataset order, so collected outputs line up with dataset rows.
original_train_dl = DataLoader(original_train_ds, batch_size=config.batch_size, shuffle=False)

No config specified, defaulting to: sst/default
Reusing dataset sst (/Users/garylai/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)
100%|██████████| 3/3 [00:00<00:00, 655.33it/s]
Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ

In [22]:
# Run the model over the whole training DataLoader and collect its logits.
# No gradients are tracked and no weights are updated here.
model.eval()
with torch.no_grad():
    # One logits tensor per batch, appended in DataLoader order.
    preds = []
    for batch in tqdm(original_train_dl):
        # NOTE(review): each batch also carries "labels" (added in preprocess), so they
        # are forwarded to the model too; only the logits are kept.
        out = model(**batch)
        # These are raw logits, not probabilities.
        preds.append(out.logits)

100%|██████████| 1068/1068 [03:50<00:00,  4.63it/s]


In [19]:
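# TODO: calculate entropy for each pred in preds (preds holds raw logits, so apply a softmax first; calculate_entropy expects probabilities that sum to 1)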




# select based on the highest entropy

tensor([[0.0450, 0.0398],
        [0.0467, 0.0305],
        [0.0464, 0.0331],
        [0.0389, 0.0680],
        [0.0853, 0.0101],
        [0.0302, 0.0546],
        [0.0490, 0.0412],
        [0.0449, 0.0400]])