## LORA Fine tuning

### Setup

In [None]:
pip install transformers peft datasets accelerate bitsandbytes

In [3]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
import numpy as np
import evaluate

### Load data

In [4]:
DATA_PATH = "PubMedQA_artificial_RAG.csv"
df = pd.read_csv(DATA_PATH)

# Construct prompt-response pairs
df["text"] = (
    "Question: " + df["question"].fillna("") +
    "\nContext: " + df["context"].fillna("") +
    "\nAnswer: " + df["long_answer"].fillna("")
)

dataset = Dataset.from_pandas(df[["text"]])
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(f"Train samples: {len(train_dataset)} | Eval samples: {len(eval_dataset)}")

Train samples: 190142 | Eval samples: 21127


### Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")


if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_datasets = split_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map: 100%|██████████| 190142/190142 [07:21<00:00, 430.79 examples/s]
Map: 100%|██████████| 21127/21127 [00:50<00:00, 422.54 examples/s]


### Configuration

In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM = autoregressive, not masked LM
)

model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"], 
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.resize_token_embeddings(len(tokenizer))

print("\nTrainable parameters:")
model.print_trainable_parameters()


Trainable parameters:
trainable params: 786,432 || all params: 347,549,696 || trainable%: 0.2263


### Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir="./biogpt_lora_results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
)

### Trainer

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


### Save model

### Validation and Test loss

In [12]:
print("\n Starting LoRA fine-tuning on BioGPT...")
trainer.train()



 Starting LoRA fine-tuning on BioGPT...


Epoch,Training Loss,Validation Loss
1,1.5053,1.452632
2,1.4896,1.443893
3,1.4879,1.440478


TrainOutput(global_step=35652, training_loss=1.511527681516555, metrics={'train_runtime': 16381.4634, 'train_samples_per_second': 34.821, 'train_steps_per_second': 2.176, 'total_flos': 5.3212010905809715e+17, 'train_loss': 1.511527681516555, 'epoch': 3.0})

In [13]:
print("\n Evaluating model...")
results = trainer.evaluate()
print("Evaluation Results:", results)



 Evaluating model...


Evaluation Results: {'eval_loss': 1.4404778480529785, 'eval_runtime': 256.0942, 'eval_samples_per_second': 82.497, 'eval_steps_per_second': 41.25, 'epoch': 3.0}


In [14]:
model.save_pretrained("./biogpt_lora_finetuned")
tokenizer.save_pretrained("./biogpt_lora_finetuned")

print("\n LoRA-tuned BioGPT saved to ./biogpt_lora_finetuned")


 LoRA-tuned BioGPT saved to ./biogpt_lora_finetuned
