In [1]:
#!/usr/bin/env python
# coding: utf-8

# Restart the kernel to ensure a clean environment
import os
import IPython
IPython.Application.instance().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

In [1]:
# Install required packages
!pip install transformers==4.48.0 accelerate==0.26.0 peft==0.7.1 datasets==2.14.5 evaluate==0.4.1 scikit-learn wandb ipywidgets matplotlib --upgrade -q

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
# Import necessary libraries
import torch
import random
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, set_seed
from datasets import load_dataset
import wandb
import time

# Set seed for reproducibility
set_seed(42)
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

print("✅ Environment ready for Prefix Tuning!")

# Load the labeled version of PubMedQA
dataset = load_dataset("pubmed_qa", "pqa_labeled")

# Create an 80/20 train-validation split
dataset = dataset["train"].train_test_split(test_size=0.2)
dataset["validation"] = dataset.pop("test")

# Reformat each example for BERT input
def format_for_bert(example):
    example["text"] = f"Question: {example['question']} Context: {example['context']}"
    label_map = {"yes": 0, "no": 1, "maybe": 2}
    example["label"] = label_map[example["final_decision"]]
    return example

dataset = dataset.map(format_for_bert)
print("✅ Dataset formatted for BERT classification")

# Load the BERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the input text - with reduced max_length to make room for prefix tokens
def tokenize_for_bert(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=492,  # Reduced to make room for prefix tokens (512 - 20 = 492)
        return_token_type_ids=True  # Important for BERT
    )

# Apply to both train and validation sets
tokenized_dataset = dataset.map(
    tokenize_for_bert,
    batched=True,
    remove_columns=["pubid", "question", "context", "long_answer", "final_decision", "text"]
)

print("✅ Tokenization complete")

# Set up Prefix Tuning
from peft import get_peft_model, PrefixTuningConfig, TaskType
from transformers import AutoModelForSequenceClassification
import wandb
import time
import torch

# Initialize wandb
if not wandb.run:
    wandb.init(
        project="peft-pubmedqa",
        name=f"bert-prefix-tuning-{time.strftime('%Y%m%d-%H%M%S')}",
        tags=["bert", "prefix-tuning", "peft"],
        config={
            "model": "bert-base-uncased",
            "method": "prefix-tuning",
            "strategy": "peft",
            "prefix_length": 20,
            "dataset": "pubmedqa",
            "seed": 42,
            "max_input_length": 492  # Note the reduced input length
        }
    )

# Load the base BERT model for classification
print("🔄 Loading base model...")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Define Prefix Tuning configuration
prefix_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=20,  # Length of prefix
    prefix_projection=True,  # Use MLP for prefix projection
    encoder_hidden_size=768  # Hidden size for BERT
)

# Wrap the model with Prefix Tuning
print("➕ Applying Prefix Tuning...")
model = get_peft_model(model, prefix_config)
model.print_trainable_parameters()

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"🚀 Model moved to {device}")

# Set up training
from transformers import TrainingArguments, Trainer
from datasets import load_metric
import os

# Create output directory
output_dir = "./results/bert-prefix-tuning"
os.makedirs(output_dir, exist_ok=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),
    report_to="wandb",
    logging_steps=50,
    push_to_hub=False,
    seed=42,
    run_name=f"bert-prefix-run-{time.strftime('%Y%m%d-%H%M%S')}"  # Added unique run_name
)

# Load evaluation metrics
accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")

# Metric computation function
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")  # For multiclass
    
    return {
        "accuracy": acc,
        "f1": f1
    }

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Start training
print("⏱️ Starting training...")
trainer.train()

# Evaluate the model
print("📊 Evaluating the Prefix Tuning model on validation set...")
eval_results = trainer.evaluate()
print(f"\n✅ Evaluation Results: {eval_results}")

# Manual prediction and detailed analysis
print("\n🔍 Running manual predictions...")
model.eval()
all_preds = []
all_labels = []

# Process validation dataset in batches
batch_size = 16
num_examples = len(tokenized_dataset["validation"])

with torch.no_grad():
    for i in range(0, num_examples, batch_size):
        end_idx = min(i + batch_size, num_examples)
        batch_data = tokenized_dataset["validation"][i:end_idx]
        
        # Convert to appropriate format
        input_ids = torch.tensor(batch_data["input_ids"]).to(device)
        attention_mask = torch.tensor(batch_data["attention_mask"]).to(device)
        token_type_ids = torch.tensor(batch_data["token_type_ids"]).to(device)
        labels = torch.tensor(batch_data["label"]).to(device)
        
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # Get predictions
        predictions = torch.argmax(outputs.logits, dim=-1)
        
        # Store results
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert numeric predictions back to text labels
label_map_reverse = {0: "yes", 1: "no", 2: "maybe"}
pred_texts = [label_map_reverse[p] for p in all_preds]
label_texts = [label_map_reverse[l] for l in all_labels]

# Calculate accuracy
accuracy = sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_preds)
print(f"\n📈 Manual Accuracy: {accuracy:.4f}")

# Calculate class-specific metrics
from sklearn.metrics import classification_report
print("\n📋 Classification Report:")
print(classification_report(all_labels, all_preds, target_names=["yes", "no", "maybe"]))

# Show sample predictions
print("\n🔎 Sample Predictions:")
for i in range(min(10, len(pred_texts))):
    match = "✓" if pred_texts[i] == label_texts[i] else "✗"
    print(f"Example {i+1}: Prediction: {pred_texts[i]}, True label: {label_texts[i]} {match}")

# Save the model
model.save_pretrained(f"{output_dir}/final_model")
print(f"💾 Prefix Tuning model saved to {output_dir}/final_model")

✅ Environment ready for Prefix Tuning!
✅ Dataset formatted for BERT classification


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

✅ Tokenization complete


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchandantroughia[0m ([33mchandantroughia-cst[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


🔄 Loading base model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


➕ Applying Prefix Tuning...
trainable params: 14,782,467 || all params: 124,267,014 || trainable%: 11.895728821487575
🚀 Model moved to cuda


  accuracy_metric = load_metric("accuracy")


⏱️ Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9527,0.972476,0.515,0.226623
2,0.9504,0.980366,0.515,0.226623
3,0.9826,0.988265,0.515,0.226623


📊 Evaluating the Prefix Tuning model on validation set...



✅ Evaluation Results: {'eval_loss': 0.9882653951644897, 'eval_accuracy': 0.515, 'eval_f1': 0.22662266226622663, 'eval_runtime': 0.899, 'eval_samples_per_second': 222.473, 'eval_steps_per_second': 27.809, 'epoch': 3.0}

🔍 Running manual predictions...

📈 Manual Accuracy: 0.5150

📋 Classification Report:
              precision    recall  f1-score   support

         yes       0.52      1.00      0.68       103
          no       0.00      0.00      0.00        72
       maybe       0.00      0.00      0.00        25

    accuracy                           0.52       200
   macro avg       0.17      0.33      0.23       200
weighted avg       0.27      0.52      0.35       200


🔎 Sample Predictions:
Example 1: Prediction: yes, True label: yes ✓
Example 2: Prediction: yes, True label: no ✗
Example 3: Prediction: yes, True label: yes ✓
Example 4: Prediction: yes, True label: maybe ✗
Example 5: Prediction: yes, True label: maybe ✗
Example 6: Prediction: yes, True label: no ✗
Example 7: Pr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
