In [12]:
# import os
# import IPython

# # Restart the kernel programmatically
# IPython.Application.instance().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

In [1]:
!pip install adapters transformers datasets evaluate scikit-learn wandb -q
!pip install ipywidgets matplotlib -q
!pip install accelerate==0.26.0 --upgrade -q

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import transformers
import datasets
import adapters
import wandb
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed, TrainingArguments, Trainer
from adapters import AutoAdapterModel, ADAPTER_CONFIG_MAP
from datasets import load_dataset
import torch
import random
import numpy as np
import wandb
import time
import os
import evaluate

print("✅ All packages installed correctly!")

✅ All packages installed correctly!


In [3]:
from datasets import load_dataset

# Load the labeled version of PubMedQA (1,000 high-quality samples)
dataset = load_dataset("pubmed_qa", "pqa_labeled")

In [4]:
# Create an 80/20 split since no validation set is provided
dataset = dataset["train"].train_test_split(test_size=0.2)

# Optional: rename "test" split to "validation" for compatibility with Hugging Face Trainer
dataset["validation"] = dataset.pop("test")

In [5]:
# Step 1: Reformat the dataset for classification
def format_for_bert(example):
    example["text"] = f"Question: {example['question']} Context: {example['context']}"
    label_map = {"yes": 0, "no": 1, "maybe": 2}
    example["label"] = label_map[example["final_decision"]]
    return example

dataset = dataset.map(format_for_bert)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [6]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'text', 'label'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'text', 'label'],
        num_rows: 200
    })
})


In [7]:
# Step 2: Tokenize
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_for_bert(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_token_type_ids=True  # Ensure segment IDs are included
    )

tokenized_dataset = dataset.map(
    tokenize_for_bert,
    batched=True,
    remove_columns=["pubid", "question", "context", "long_answer", "final_decision", "text"]
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

## Phase 2

✅ Phase 2: Adapter Model Setup

🔹 Goal:

Load a pre-trained t5-base model and attach a trainable Adapter module.

In [8]:
# Phase 2: Adapter Model Setup with BERT
# Set random seed for reproducibility
set_seed(42)
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Load the dataset
dataset = load_dataset("pubmed_qa", "pqa_labeled")
dataset = dataset["train"].train_test_split(test_size=0.2)
dataset["validation"] = dataset.pop("test")

# Use the Pfeiffer adapter configuration
adapter_config_name = "pfeiffer"
adapter_config = ADAPTER_CONFIG_MAP[adapter_config_name]

print(f"Using {adapter_config_name} adapter configuration")

Using pfeiffer adapter configuration


In [9]:
# Initialize wandb for Adapter approach
if not wandb.run:
    wandb.init(
        project="peft-pubmedqa",
        name=f"bert-adapter-{time.strftime('%Y%m%d-%H%M%S')}",
        tags=["bert", "adapter", "peft"],
        config={
            "model": "bert-base-uncased",
            "method": "adapter",
            "strategy": "peft",
            "adapter_type": "pfeiffer",
            "dataset": "pubmedqa",
            "seed": 42
        }
    )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchandantroughia[0m ([33mchandantroughia-cst[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# Step 3: Load BERT with adapter support
print("🔄 Loading BERT model with adapter support...")
model = AutoAdapterModel.from_pretrained("bert-base-uncased")
model.add_classification_head("pubmedqa", num_labels=3)

# Step 4: Attach Adapter
print(f"➕ Adding 'pubmedqa_adapter' with {adapter_config_name} configuration to model...")
if "pubmedqa_adapter" in model.adapters_config.adapters:
    model.delete_adapter("pubmedqa_adapter")
model.add_adapter("pubmedqa_adapter", config=adapter_config)
model.train_adapter("pubmedqa_adapter")
model.set_active_adapters("pubmedqa_adapter")

# Parameter stats
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
efficiency = trainable_params / total_params

print(f"📦 Total model parameters: {total_params:,}")
print(f"🎯 Trainable parameters (adapter only): {trainable_params:,}")
print(f"💡 Parameter efficiency: {efficiency:.2%}")

🔄 Loading BERT model with adapter support...


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


➕ Adding 'pubmedqa_adapter' with pfeiffer configuration to model...
📦 Total model parameters: 111,592,317
🎯 Trainable parameters (adapter only): 2,110,077
💡 Parameter efficiency: 1.89%


In [11]:
# Log to wandb
wandb.log({
    "total_params": total_params,
    "trainable_params": trainable_params,
    "adapter_efficiency": efficiency
})

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"🚀 Model moved to {device}")

# Step 5: Training Setup - Modified
output_dir = f"./results/bert-adapter-{adapter_config_name}"
os.makedirs(output_dir, exist_ok=True)

print("⚙️ Configuring training parameters...")
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),
    report_to="wandb",
    logging_steps=50,
    push_to_hub=False,
    seed=42,
    # Remove metric_for_best_model and load_best_model_at_end 
    # to simplify the process for now
    run_name=f"bert-adapter-run-{time.strftime('%Y%m%d-%H%M%S')}"
)

# Step 6: Define metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


🚀 Model moved to cuda
⚙️ Configuring training parameters...




Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [1]:
# Simplified compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Print some examples for debugging
    print("\nSample predictions vs labels:")
    for i in range(5):
        print(f"Prediction: {predictions[i]}, Label: {labels[i]}")
    
    # Calculate accuracy manually
    correct = (predictions == labels).sum()
    total = len(predictions)
    accuracy = float(correct) / total
    
    # Use sklearn for F1
    from sklearn.metrics import f1_score
    f1 = f1_score(labels, predictions, average='macro')
    
    print(f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
    
    return {
        "accuracy": accuracy,
        "f1": f1
    }

# Step 7: Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics
)

print("⏱️ Starting adapter training...")
start_time = time.time()
trainer.train()
training_time = time.time() - start_time

print(f"✅ Training completed in {training_time:.2f} seconds")
wandb.log({"training_time": training_time})

# Save the adapter
model.save_adapter(f"{output_dir}/final_adapter", "pubmedqa_adapter")
print(f"💾 Adapter saved to {output_dir}/final_adapter")

NameError: name 'Trainer' is not defined

In [13]:
# Evaluate the trained model
print("📊 Evaluating the adapter model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Manual prediction and analysis
print("\nRunning manual predictions...")
model.eval()
all_preds = []
all_labels = []

# Process validation dataset in batches
batch_size = 16
num_examples = len(tokenized_dataset["validation"])

with torch.no_grad():
    for i in range(0, num_examples, batch_size):
        end_idx = min(i + batch_size, num_examples)
        batch_data = tokenized_dataset["validation"][i:end_idx]
        
        # Convert to appropriate format
        input_ids = torch.tensor(batch_data["input_ids"]).to(device)
        attention_mask = torch.tensor(batch_data["attention_mask"]).to(device)
        token_type_ids = torch.tensor(batch_data["token_type_ids"]).to(device)
        labels = torch.tensor(batch_data["label"]).to(device)
        
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # Get predictions
        predictions = torch.argmax(outputs.logits, dim=-1)
        
        # Store results
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert numeric predictions back to text labels
label_map_reverse = {0: "yes", 1: "no", 2: "maybe"}
pred_texts = [label_map_reverse[pred] for pred in all_preds]
label_texts = [label_map_reverse[label] for label in all_labels]

# Calculate accuracy
accuracy = sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_preds)
print(f"Accuracy: {accuracy:.4f}")

# Calculate class-specific metrics
from sklearn.metrics import classification_report
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["yes", "no", "maybe"]))

# Show some examples
print("\nExample predictions:")
for i in range(min(10, len(pred_texts))):
    match = "✓" if pred_texts[i] == label_texts[i] else "✗"
    print(f"Example {i+1}: Prediction: {pred_texts[i]}, True label: {label_texts[i]} {match}")

📊 Evaluating the adapter model...


Evaluation results: {'eval_runtime': 0.9662, 'eval_samples_per_second': 206.991, 'eval_steps_per_second': 25.874, 'epoch': 3.0}

Running manual predictions...
Accuracy: 0.7900

Classification Report:
              precision    recall  f1-score   support

         yes       0.82      0.87      0.84       113
          no       0.77      0.70      0.73        67
       maybe       0.65      0.65      0.65        20

    accuracy                           0.79       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.79      0.79      0.79       200


Example predictions:
Example 1: Prediction: no, True label: no ✓
Example 2: Prediction: yes, True label: yes ✓
Example 3: Prediction: yes, True label: no ✗
Example 4: Prediction: no, True label: no ✓
Example 5: Prediction: yes, True label: yes ✓
Example 6: Prediction: maybe, True label: maybe ✓
Example 7: Prediction: yes, True label: no ✗
Example 8: Prediction: yes, True label: yes ✓
Example 9: Prediction: no, True