### Description:
    This script implements Active Learning with **Uncertainty Sampling** using PyTorch and Hugging Face's transformers.
    Instead of randomly acquiring new labeled samples, it selects those where the model is **least confident** (least certain about its prediction).
    The model is iteratively fine-tuned on the growing labeled set (all previously labeled samples plus the newly acquired ones).
    The results (validation accuracy vs. labeled samples) are plotted.

In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader, Subset
from tqdm.auto import tqdm
import numpy as np
import random
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every random number generator the notebook relies on (Python's
# `random`, NumPy, and PyTorch) with the same value so that the shuffling
# and training below are repeatable from run to run.
for _seeder in (random.seed, np.random.seed, torch.manual_seed):
    _seeder(42)

# CUDA keeps separate generators per device; seed them all when a GPU exists.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

In [3]:
# Checkpoint to fine-tune and the size of the classification head.
model_name = "distilbert-base-uncased"
num_labels = 2  # Binary classification

# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The tokenizer and the model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Kept as the cell's last expression so the notebook still shows the model repr.
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [4]:
# Load the "train" and "validation" splits of the GLUE SST-2 task, in that
# order. The train split serves as the labeled/unlabeled pool; the validation
# split is only used for evaluation.
train_dataset, val_dataset = (
    load_dataset("glue", "sst2", split=split_name)
    for split_name in ("train", "validation")
)

In [5]:
def collate_fn(batch):
    """Turn a list of dataset rows into one tokenized, tensor-valued batch.

    Args:
        batch: Iterable of rows, each indexable by ``"sentence"`` (the raw
            text) and ``"label"`` (the class id).

    Returns:
        The object returned by the module-level ``tokenizer`` (called with
        ``padding=True``, ``truncation=True`` and ``return_tensors="pt"``),
        with an extra ``"labels"`` entry holding the labels as a tensor.
    """
    sentences, targets = [], []
    for example in batch:
        sentences.append(example["sentence"])
        targets.append(example["label"])

    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    encoded["labels"] = torch.tensor(targets)
    return encoded


### Active Learning:

In [6]:
# Active-learning schedule.
# Number of shuffled training examples that start out in the labeled pool.
initial_label_size = 50
# Number of most-uncertain examples moved into the labeled pool per round.
acquisition_size = 50
# Number of train -> evaluate -> acquire rounds to run.
max_iterations = 10

In [7]:
# Shuffle the positions of the training set (uses the seeded `random` module)
# so the initial labeled pool is a random sample rather than the first rows.
all_indices = [*range(len(train_dataset))]
random.shuffle(all_indices)

# The first `initial_label_size` shuffled positions are treated as labeled;
# everything after them forms the unlabeled pool.
labeled_indices, unlabeled_indices = (
    all_indices[:initial_label_size],
    all_indices[initial_label_size:],
)

# Per-round history consumed by the plot: labeled-pool size and accuracy.
labeled_sizes, val_accuracies = [], []

#### Active Learning Loop:

In [None]:
# Active-learning driver. Each round:
#   1. fine-tunes the (already partially trained) model on every example
#      labeled so far,
#   2. measures accuracy on the validation split and records it,
#   3. scores the remaining unlabeled pool and moves the `acquisition_size`
#      least-confident examples into the labeled pool.

# Round-invariant setup: the validation loader and the epoch count do not
# change between rounds, so they are created once instead of per iteration.
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)
epochs = 5

for iteration in range(max_iterations):
    print(f"\n=== Active Learning Iteration {iteration+1} ===")

    # ---------------------------------
    # Training on the current labeled pool
    # ---------------------------------
    labeled_subset = Subset(train_dataset, labeled_indices)
    labeled_loader = DataLoader(labeled_subset, batch_size=16, shuffle=True, collate_fn=collate_fn)

    # Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in
    # favour of it. `eps` and `weight_decay` are pinned to the defaults of the
    # former transformers implementation so the optimisation setup is unchanged.
    # The optimizer is re-created every round, which resets its moment estimates.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch in tqdm(labeled_loader, desc=f"Training Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            # Passing `labels` makes the model return the loss directly.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {epoch_loss/len(labeled_loader):.4f}")

    # ---------------------------------
    # Evaluation on the validation split
    # ---------------------------------
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
    print(f"Validation Accuracy: {accuracy*100:.2f}%")

    labeled_sizes.append(len(labeled_indices))
    val_accuracies.append(accuracy)

    # ---------------------------------
    # Acquisition: Uncertainty Sampling
    # ---------------------------------

    # Stop when nothing is left to acquire, or after the final round: samples
    # acquired then would never be trained on, so the full inference pass over
    # the unlabeled pool would be wasted work.
    if iteration == max_iterations - 1 or not unlabeled_indices:
        break

    unlabeled_subset = Subset(train_dataset, unlabeled_indices)
    unlabeled_loader = DataLoader(unlabeled_subset, batch_size=32, collate_fn=collate_fn)

    batch_uncertainties = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(unlabeled_loader, desc="Computing Uncertainties"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(outputs.logits, dim=-1)
            max_probs = probabilities.max(dim=-1).values
            # Least-confidence score: lower max probability = higher uncertainty.
            batch_uncertainties.append((1 - max_probs).cpu())
    uncertainties = torch.cat(batch_uncertainties).numpy()

    # Rank unlabeled samples by uncertainty; argsort is ascending, so the last
    # `acquisition_size` positions are the most uncertain ones. Positions index
    # into `unlabeled_indices`, which the loader iterated in order (no shuffle).
    uncertain_indices = np.argsort(uncertainties)[-acquisition_size:]
    new_indices = [unlabeled_indices[i] for i in uncertain_indices]

    labeled_indices.extend(new_indices)
    # Drop the acquired samples while preserving the pool's existing order
    # (a set difference would reorder the remaining indices).
    acquired = set(new_indices)
    unlabeled_indices = [idx for idx in unlabeled_indices if idx not in acquired]

    print(f"Total labeled samples after acquisition: {len(labeled_indices)}")


=== Active Learning Iteration 1 ===


Training Epoch 1: 100%|██████████| 4/4 [00:06<00:00,  1.64s/it]


Epoch 1 Loss: 0.7038


Training Epoch 2: 100%|██████████| 4/4 [00:06<00:00,  1.64s/it]


Epoch 2 Loss: 0.7091


Training Epoch 3: 100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


Epoch 3 Loss: 0.6287


Training Epoch 4: 100%|██████████| 4/4 [00:05<00:00,  1.50s/it]


Epoch 4 Loss: 0.6260


Training Epoch 5: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


Epoch 5 Loss: 0.6080


Evaluating: 100%|██████████| 28/28 [00:24<00:00,  1.13it/s]


Validation Accuracy: 50.92%


Computing Uncertainties:  70%|██████▉   | 1468/2104 [22:07<20:44,  1.96s/it]

In [None]:
# Learning curve: validation accuracy (as a percentage) against the number of
# labeled samples the model had been trained on in each round.
fig, ax = plt.subplots(figsize=(8, 6))
accuracy_pct = [100 * acc for acc in val_accuracies]
ax.plot(labeled_sizes, accuracy_pct, marker='o', linestyle='-')
ax.set_xlabel("Number of Labeled Samples")
ax.set_ylabel("Validation Accuracy (%)")
ax.set_title("Active Learning Performance (Uncertainty Sampling)")
ax.grid(True)
plt.show()