# Finetune with LoRA

In [1]:
# pip install transformers datasets lightning watermark

In [2]:
%load_ext watermark
%watermark --conda -p torch,transformers,datasets,lightning

  from .autonotebook import tqdm as notebook_tqdm


torch       : 2.5.1
transformers: 4.46.2
datasets    : 3.1.0
lightning   : 2.5.1.post0

conda environment: py311



# 1 Loading the dataset into DataFrames

In [None]:
# Standard imports

import os
from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import pandas as pd
import torch

# Custom utilities for downloading and preparing the dataset
from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [None]:
# Warn (without stopping execution) when no CUDA device is visible.
# The Trainer configured later in this notebook requests accelerator="gpu".

cuda_missing = not torch.cuda.is_available()
if cuda_missing:
    print("Please switch to a GPU machine before running this notebook.")

In [None]:
# Determine whether to download dataset or not based on existence of files

files = ("test.csv", "train.csv", "val.csv")

# `download` is True only when at least one expected split file is missing
# under data/. (The flag used to be inverted: it started as True, was flipped
# to False when a file was missing, and the download ran on `download is False`.)
download = not all(os.path.exists(os.path.join("data", f)) for f in files)

if download:
    # Fetch the raw data, load it into a DataFrame and partition it.
    # NOTE(review): partition_dataset presumably writes the train/val/test CSVs
    # that the following cells read from data/ -- helper source is not shown here.
    download_dataset()
    df = load_dataset_into_to_dataframe()
    partition_dataset(df)

In [None]:
# Load already-prepared train/val/test CSVs

def _read_split(csv_name):
    """Read one split CSV from the data/ directory into a DataFrame."""
    return pd.read_csv(os.path.join("data", csv_name))


df_train = _read_split("train.csv")
df_val = _read_split("val.csv")
df_test = _read_split("test.csv")

# 2 Tokenization and Numericalization

**Load the dataset via `load_dataset`**

In [None]:
# Load the CSVs into Hugging Face's DatasetDict format

# Map each split name to its CSV file, then load all splits in one call.
split_files = {
    "train": os.path.join("data", "train.csv"),
    "validation": os.path.join("data", "val.csv"),
    "test": os.path.join("data", "test.csv"),
}
imdb_dataset = load_dataset("csv", data_files=split_files)

print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 19
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 14
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10
    })
})


**Tokenize the dataset**

In [None]:
# Load tokenizer from Hugging Face for distilBERT

from transformers import AutoTokenizer

# Name of the pre-trained checkpoint whose tokenizer is loaded.
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Report the tokenizer limits that matter for preprocessing.
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [None]:
# Tokenize the dataset for BERT-style input 

def tokenize_text(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook-level ``tokenizer``.

    Sequences are truncated (``truncation=True``) and padded
    (``padding=True``); the tokenizer's output is returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [None]:
# Apply tokenizer to all splits.
# batched=True hands tokenize_text many rows at once; with batch_size=None
# each split is passed as a single batch, so padding=True in tokenize_text
# pads every example to the longest sequence of its split.

imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

In [None]:
# Free memory by deleting the original (un-tokenized) dataset; only
# imdb_tokenized is used from here on.
# NOTE: the tokenization cell above reads imdb_dataset, so re-running it after
# this cell requires re-running the load_dataset cell first.
del imdb_dataset

In [None]:
# Set tensor format for PyTorch and specify usable columns:
# indexing a split now returns only input_ids, attention_mask and label,
# as torch tensors; the remaining columns are not returned.
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Turn off the tokenizers library's internal parallelism to silence its warnings.
# NOTE(review): presumably needed because the DataLoaders defined below start
# worker processes (num_workers=4) -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 Set Up DataLoaders

In [None]:
# Create custom PyTorch Dataset wrapper for DataLoader
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    """Expose one split of a dataset dictionary as a PyTorch ``Dataset``.

    Note: this definition replaces the ``IMDBDataset`` name imported from
    ``local_dataset_utilities`` at the top of the notebook.
    """

    def __init__(self, dataset_dict, partition_key="train"):
        """Select ``dataset_dict[partition_key]`` as the backing split."""
        selected_split = dataset_dict[partition_key]
        # All lookups and the length are delegated to this split.
        self.partition = selected_split

    def __len__(self):
        """Return the ``num_rows`` attribute of the selected split."""
        return self.partition.num_rows

    def __getitem__(self, index):
        """Return whatever the selected split yields for ``index``."""
        return self.partition[index]

In [None]:
# Create datasets for each split
# Settings shared by all three loaders.
BATCH_SIZE = 12
NUM_WORKERS = 4

train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

# Create dataloaders for training, validation, and testing.
# Only the training loader shuffles its data.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

# 4 Initializing DistilBERT

In [None]:
# Load the pre-trained DistilBERT model with a 2-label sequence-classification head
from transformers import AutoModelForSequenceClassification

# Checkpoint to start from and the number of output labels for the classifier head.
model_checkpoint = "distilbert-base-uncased"
num_classes = 2

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_classes,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Freeze all layers**

In [None]:
# Freeze all parameters initially

# Turn off gradient tracking for every parameter currently in the model.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

**Add LoRA layers**

In [None]:
model # show model architecture

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# --- Define LoRA (Low-Rank Adaptation) Modules ---
# LoRA introduces trainable low-rank matrices into frozen pretrained models to reduce memory usage and training cost.


class LoRALayer(torch.nn.Module):
    """Low-rank update ``alpha * (x @ W_a @ W_b)``.

    Two trainable matrices are held: ``W_a`` with shape (in_dim, rank) and
    ``W_b`` with shape (rank, out_dim). Their product stands in for a full
    (in_dim, out_dim) weight update.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        """
        Args:
            in_dim: size of the last dimension of the input.
            out_dim: size of the last dimension of the output.
            rank: inner dimension shared by ``W_a`` and ``W_b``.
            alpha: constant multiplier applied to the low-rank product.
        """
        super().__init__()
        # W_a gets a random normal init scaled by 1 / sqrt(rank).
        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        down_init = torch.randn(in_dim, rank) * init_scale
        # W_b starts at zero, so the layer outputs zeros until W_b is updated.
        up_init = torch.zeros(rank, out_dim)

        self.W_a = torch.nn.Parameter(down_init)
        self.W_b = torch.nn.Parameter(up_init)
        self.alpha = alpha

    def forward(self, x):
        """Return ``alpha * (x @ W_a @ W_b)``."""
        low_rank = x @ self.W_a @ self.W_b
        return self.alpha * low_rank


class LinearWithLoRA(torch.nn.Module):
    """Run a LoRA branch in parallel with an existing linear layer.

    The wrapped layer is stored as-is: this class does not touch its
    ``requires_grad`` flags, so freezing it is left to the caller.
    """

    def __init__(self, linear, rank, alpha):
        """
        Args:
            linear: layer exposing ``in_features`` and ``out_features``.
            rank: rank passed on to the LoRA branch.
            alpha: scaling factor passed on to the LoRA branch.
        """
        super().__init__()
        # Base projection taken from the pre-trained model.
        self.linear = linear
        # The LoRA branch is sized from the wrapped layer's dimensions.
        in_features, out_features = linear.in_features, linear.out_features
        self.lora = LoRALayer(in_features, out_features, rank, alpha)

    def forward(self, x):
        """Return the wrapped layer's output plus the LoRA branch's output."""
        base_out = self.linear(x)
        lora_out = self.lora(x)
        return base_out + lora_out

In [None]:
# --- Configure LoRA injection ---

from functools import partial

# LoRA configuration
lora_r = 8 # Rank of the low-rank matrices (controls trainable parameter size)
lora_alpha = 16 # Scaling factor for LoRA output
lora_dropout = 0.05 # NOTE: not passed to assign_lora / LoRALayer below, so it currently has no effect

# Flags to control which parts of the model receive LoRA injections
lora_query = True # Apply LoRA to query projection in self-attention
lora_key = False # Apply LoRA to key projection
lora_value = True # Apply LoRA to value projection
lora_projection = False # Apply LoRA to attention output projection
lora_mlp = False # Apply LoRA to feed-forward MLP layers
lora_head = False # Apply LoRA to final classifier head

layers = [] # NOTE: not used by this cell

# Create a partial function to simplify LoRA module creation
assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)


def _wrap_once(module):
    """Return ``module`` wrapped with LoRA, or unchanged if already wrapped.

    Without this guard, re-running the cell would call assign_lora on a
    LinearWithLoRA instance, which has no ``in_features`` attribute, and the
    cell would fail. Already wrapped layers keep their existing rank/alpha;
    reload the model to apply a changed configuration.
    """
    if isinstance(module, LinearWithLoRA):
        return module
    return assign_lora(module)


# Inject LoRA into the selected layers of each Transformer block in DistilBERT.
# The LoRA parameters are created after the freeze step above, so they are
# the trainable ones.
for layer in model.distilbert.transformer.layer:
    if lora_query:
        layer.attention.q_lin = _wrap_once(layer.attention.q_lin)
    if lora_key:
        layer.attention.k_lin = _wrap_once(layer.attention.k_lin)
    if lora_value:
        layer.attention.v_lin = _wrap_once(layer.attention.v_lin)
    if lora_projection:
        layer.attention.out_lin = _wrap_once(layer.attention.out_lin)
    if lora_mlp:
        layer.ffn.lin1 = _wrap_once(layer.ffn.lin1)
        layer.ffn.lin2 = _wrap_once(layer.ffn.lin2)
if lora_head:
    model.pre_classifier = _wrap_once(model.pre_classifier)
    model.classifier = _wrap_once(model.classifier)

In [21]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_lin): Linear(in_features=768, out_features=768, bia

In [22]:
# Check if linear layers are frozen
# One line per parameter: its qualified name and whether it will receive gradients.
for param_name, parameter in model.named_parameters():
    print(f"{param_name}: {parameter.requires_grad}")

distilbert.embeddings.word_embeddings.weight: False
distilbert.embeddings.position_embeddings.weight: False
distilbert.embeddings.LayerNorm.weight: False
distilbert.embeddings.LayerNorm.bias: False
distilbert.transformer.layer.0.attention.q_lin.linear.weight: False
distilbert.transformer.layer.0.attention.q_lin.linear.bias: False
distilbert.transformer.layer.0.attention.q_lin.lora.W_a: True
distilbert.transformer.layer.0.attention.q_lin.lora.W_b: True
distilbert.transformer.layer.0.attention.k_lin.weight: False
distilbert.transformer.layer.0.attention.k_lin.bias: False
distilbert.transformer.layer.0.attention.v_lin.linear.weight: False
distilbert.transformer.layer.0.attention.v_lin.linear.bias: False
distilbert.transformer.layer.0.attention.v_lin.lora.W_a: True
distilbert.transformer.layer.0.attention.v_lin.lora.W_b: True
distilbert.transformer.layer.0.attention.out_lin.weight: False
distilbert.transformer.layer.0.attention.out_lin.bias: False
distilbert.transformer.layer.0.sa_layer_no

In [None]:
# Utility to count trainable parameters

def count_parameters(model):
    """Return the total element count of all parameters with ``requires_grad=True``."""
    trainable_total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_total += parameter.numel()
    return trainable_total


print("Total number of trainable parameters:", count_parameters(model))

Total number of trainable parameters: 147456


# 5 Finetuning

**Wrap in LightningModule for Training**

In [None]:
# Wrap model in custom PyTorch Lightning module.
# CustomLightningModule is project code from local_model_utilities (its source
# is not part of this notebook).
# NOTE(review): later cells assume it logs "val_acc" (monitored by
# ModelCheckpoint) and reports "accuracy" from trainer.test -- confirm.
from local_model_utilities import CustomLightningModule

lightning_model = CustomLightningModule(model)

In [None]:
# Define callbacks: save best model by validation accuracy

# Keep only the single checkpoint with the highest monitored "val_acc".
best_checkpoint = ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)
callbacks = [best_checkpoint]

# CSV logger rooted at logs/ with the experiment name "distilbert_lora".
logger = CSVLogger(save_dir="logs/", name="distilbert_lora")

In [None]:
# Setup Lightning Trainer

# Collect the Trainer settings in one place, then build the Trainer from them.
trainer_options = dict(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="gpu",
    precision="16-mixed",
    devices=1,
    logger=logger,
    log_every_n_steps=10,
)
trainer = L.Trainer(**trainer_options)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# Train the model and track elapsed time (commented out: the cells below evaluate a previously saved checkpoint; uncomment to retrain)

# import time
# start = time.time()

# trainer.fit(model=lightning_model,
#             train_dataloaders=train_loader,
#             val_dataloaders=val_loader)

# end = time.time()
# elapsed = end - start
# print(f"Time elapsed {elapsed/60:.2f} min")

In [None]:
# Path of the saved checkpoint to evaluate. Nothing is loaded in this cell;
# the path is passed to trainer.test as ckpt_path in the next cell.

saved_model_path="logs/distilbert_lora/version_0/checkpoints/epoch=0-step=2917.ckpt" # trained on the full dataset
# train_model_path="logs/distilbert_lora/version_1/checkpoints/epoch=0-step=2.ckpt"

In [None]:
# Evaluate model on each dataset split

def _evaluate(loader):
    """Run trainer.test on ``loader`` using the weights stored at saved_model_path."""
    return trainer.test(
        lightning_model,
        dataloaders=loader,
        ckpt_path=saved_model_path,
        verbose=False,
    )


train_acc = _evaluate(train_loader)
val_acc = _evaluate(val_loader)
test_acc = _evaluate(test_loader)

You are using a CUDA device ('NVIDIA H100 PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at logs/distilbert_lora/version_0/checkpoints/epoch=0-step=2917.ckpt
/home/a14-hliu/miniconda3/envs/py311/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:362: The dirpath has changed from 'logs/my-model/version_1/checkpoints' to 'logs/distilbert_lora/version_2/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at logs/distilbert_lora/version_0/checkpoints/ep

Testing DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  3.24it/s]

Restoring states from the checkpoint path at logs/distilbert_lora/version_0/checkpoints/epoch=0-step=2917.ckpt





LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at logs/distilbert_lora/version_0/checkpoints/epoch=0-step=2917.ckpt


Testing DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 79.59it/s]


Restoring states from the checkpoint path at logs/distilbert_lora/version_0/checkpoints/epoch=0-step=2917.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at logs/distilbert_lora/version_0/checkpoints/epoch=0-step=2917.ckpt


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 65.71it/s]


In [None]:
# Print final evaluation results

# Each result is a list whose first entry holds the "accuracy" metric; convert to percent.
train_pct = train_acc[0]['accuracy'] * 100
val_pct = val_acc[0]['accuracy'] * 100
test_pct = test_acc[0]['accuracy'] * 100

print(f"Train acc: {train_pct:2.2f}%")
print(f"Val acc:   {val_pct:2.2f}%")
print(f"Test acc:  {test_pct:2.2f}%")

Train acc: 100.00%
Val acc:   85.71%
Test acc:  80.00%


In [None]:
# Optional cleanup of model checkpoints and logs

# import shutil

# # Cleanup checkpoint files as we don't need them later
# log_dir = f"logs/my-model"
# if os.path.exists(log_dir):
#     shutil.rmtree(log_dir)