## Instruction finetuning

### Imports and setup

In [12]:
import os
import math
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
)
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

# Check for CUDA availability and set device appropriately
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


Using device: cuda
CUDA device: NVIDIA GeForce RTX 5070 Ti
CUDA memory: 17.1 GB


### Load and split data

In [13]:
csv_path = "PubMedQA_instruction_clean.csv"  
dataset = load_dataset("csv", data_files=csv_path)["train"]

# Split 80/10/10
train_test = dataset.train_test_split(test_size=0.2, seed=42)
val_test = train_test["test"].train_test_split(test_size=0.5, seed=42)
train_dataset = train_test["train"]
val_dataset = val_test["train"]
test_dataset = val_test["test"]

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")


Train: 168447, Val: 21056, Test: 21056


### Load BIOGPT

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

# Some BioGPT models have no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

# Move model to device immediately after loading
model = model.to(device)
print(f"Model moved to {device}")

Model moved to cuda


### Preprocessing function

In [15]:
def format_example(instr, out):
    return f"### Instruction:\n{instr}\n\n### Response:\n{out}"

def preprocess(batch):
    texts = [
        format_example(i, o)
        for i, o in zip(batch["instruction"], batch["output"])
    ]
    model_inputs = tokenizer(
        texts, truncation=True, padding="max_length", max_length=512
    )
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 21056/21056 [00:13<00:00, 1510.60 examples/s]


### Training Config

In [18]:
EPOCHS = 2
BATCH_SIZE = 2
GRAD_ACCUM = 4
LR = 5e-5
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
SAVE_DIR = "./biogpt_instruction_finetuned"
os.makedirs(SAVE_DIR, exist_ok=True)

### Trainer setup


In [19]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
num_training_steps = len(train_loader) * EPOCHS
num_warmup_steps = int(WARMUP_RATIO * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Model is already moved to device in the previous cell
print(f"Training on {device} with {len(train_loader)} batches per epoch")

best_val_loss = float("inf")
global_step = 0

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    optimizer.zero_grad()

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(loop):
        # Move batch to device (GPU if available, otherwise CPU)
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss / GRAD_ACCUM
        loss.backward()
        running_loss += loss.item()

        if (step + 1) % GRAD_ACCUM == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

        loop.set_postfix(loss=loss.item() * GRAD_ACCUM)

    avg_train_loss = running_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} | Train Loss: {avg_train_loss:.4f}")

    # ======================================================
    # Validation
    # ======================================================
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    val_perplexity = math.exp(avg_val_loss)
    print(f"Validation Loss: {avg_val_loss:.4f} | Perplexity: {val_perplexity:.2f}")

    # Save checkpoint if improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(f"{SAVE_DIR}/checkpoint_epoch_{epoch+1}")
        tokenizer.save_pretrained(f"{SAVE_DIR}/checkpoint_epoch_{epoch+1}")
        print(f"Saved checkpoint for epoch {epoch+1}")

# ==========================================================
# STEP 6: Save Final Model
# ==========================================================
model.save_pretrained(f"{SAVE_DIR}/final")
tokenizer.save_pretrained(f"{SAVE_DIR}/final")
print(f"\n Final model saved at {SAVE_DIR}/final")

Training on cuda with 84224 batches per epoch


Epoch 1: 100%|██████████| 84224/84224 [3:19:44<00:00,  7.03it/s, loss=0.86]   



Epoch 1 | Train Loss: 0.1336
Validation Loss: 0.3437 | Perplexity: 1.41
Saved checkpoint for epoch 1


Epoch 2: 100%|██████████| 84224/84224 [3:21:39<00:00,  6.96it/s, loss=0.368]  



Epoch 2 | Train Loss: 0.0828
Validation Loss: 0.3431 | Perplexity: 1.41
Saved checkpoint for epoch 2

 Final model saved at ./biogpt_instruction_finetuned/final


### Evaluation on test set

In [20]:
# ==========================================================
# Final Model Evaluation on Test Set
# ==========================================================
print("\n" + "="*50)
print("FINAL MODEL EVALUATION")
print("="*50)

test_loader = DataLoader(test_dataset, batch_size=2)
model.eval()
test_loss = 0.0
total_tokens = 0

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        test_loss += outputs.loss.item()
        
        # Count tokens for more accurate perplexity
        total_tokens += batch['input_ids'].numel()

avg_test_loss = test_loss / len(test_loader)
perplexity = math.exp(avg_test_loss)

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test Perplexity: {perplexity:.2f}")
print(f"Total tokens evaluated: {total_tokens:,}")
print("="*50)


FINAL MODEL EVALUATION
Test Loss: 0.3453
Test Perplexity: 1.41
Total tokens evaluated: 10,780,672


### Generation

In [21]:
prompt = "### Instruction:\nWhat are the effects of aspirin on platelets?\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
print("\n Example Generation:\n")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


 Example Generation:

# # # Instruction: What are the effects of aspirin on platelets? # # # Response: The platelets of aspirin-treated patients have decreased adhesion to fibrinogen in vitro. This decrease in platelet adhesion is associated with a reduction in platelet aggregation. These effects are associated with a reduced plasma concentration of the endothelial cell adhesion molecule ICAM-1, which suggests that aspirin may have a direct effect on the endothelial cell. # # # Response: The platelet activity of aspirin-treated patients was not different from that of healthy controls. It is suggested that aspirin may be a useful therapeutic agent for preventing thrombus formation in patients with acute coronary syndromes. # # Response. The platelets of aspirin-treated patients had a
