<a href="https://colab.research.google.com/github/rickiepark/MLQandAI/blob/main/supplementary/q18-using-llms/05_lora/lora-llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LoRA로 미세 튜닝하기

In [1]:
!pip install datasets lightning

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning
  Downloading lightning-2.5.0.post0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)


In [2]:
!pip install --quiet watermark

%load_ext watermark
%watermark -p torch,transformers,datasets,lightning

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m51.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25htorch       : 2.5.1+cu121
transformers: 4.47.1
datasets    : 3.2.0
lightning   : 2.5.0.post0



# 1 Loading the dataset into DataFrames

In [3]:
!wget https://raw.githubusercontent.com/rickiepark/MLQandAI/refs/heads/main/supplementary/q18-using-llms/04_adapter/local_dataset_utilities.py

--2025-01-17 07:15:54--  https://raw.githubusercontent.com/rickiepark/MLQandAI/refs/heads/main/supplementary/q18-using-llms/04_adapter/local_dataset_utilities.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2949 (2.9K) [text/plain]
Saving to: ‘local_dataset_utilities.py’


2025-01-17 07:15:55 (53.7 MB/s) - ‘local_dataset_utilities.py’ saved [2949/2949]



In [4]:
import os
from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import pandas as pd
import torch

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [5]:
if not torch.cuda.is_available():
    print("이 노트북을 실행하기 전에 GPU 런타임으로 바꾸세요.")

In [6]:
files = ("test.csv", "train.csv", "val.csv")
download = True

for f in files:
    if not os.path.exists(f):
        download = False

if download is False:
    download_dataset()
    df = load_dataset_into_to_dataframe()
    partition_dataset(df)

100% | 80.23 MB | 2.20 MB/s | 36.51 sec elapsed

100%|██████████| 50000/50000 [00:46<00:00, 1076.88it/s]


Class distribution:


In [8]:
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")

# 2 토큰화

**`load_dataset`으로 데이터셋을 로드합니다**

In [13]:
imdb_dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "validation": "val.csv",
        "test": "test.csv",
    },
)

print(imdb_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


**토큰화를 수행합니다**

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("토크나이저의 입력 최대 길이:", tokenizer.model_max_length)
print("토크나이저의 어휘 사전 크기:", tokenizer.vocab_size)

토크나이저의 입력 최대 길이: 512
토크나이저의 어휘 사전 크기: 30522


In [15]:
def tokenize_text(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

In [16]:
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [17]:
del imdb_dataset

In [18]:
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [19]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 데이터로더 준비하기

In [20]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    def __init__(self, dataset_dict, partition_key="train"):
        self.partition = dataset_dict[partition_key]

    def __getitem__(self, index):
        return self.partition[index]

    def __len__(self):
        return self.partition.num_rows

In [21]:
train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=12,
    shuffle=True,
    num_workers=4
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=12,
    num_workers=4
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=12,
    num_workers=4
)



# 4 DistilBERT 초기화

In [22]:
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**모든 층을 동결합니다**

In [23]:
for param in model.parameters():
    param.requires_grad = False

**LoRA 층을 추가합니다**

In [24]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [25]:
class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.W_a = torch.nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.W_b = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        x = self.alpha * (x @ self.W_a @ self.W_b)
        return x


class LinearWithLoRA(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        return self.linear(x) + self.lora(x)

In [26]:
from functools import partial

lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_query = True
lora_key = False
lora_value = True
lora_projection = False
lora_mlp = False
lora_head = False

layers = []

assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)

for layer in model.distilbert.transformer.layer:
    if lora_query:
        layer.attention.q_lin = assign_lora(layer.attention.q_lin)
    if lora_key:
        layer.attention.k_lin = assign_lora(layer.attention.k_lin)
    if lora_value:
        layer.attention.v_lin = assign_lora(layer.attention.v_lin)
    if lora_projection:
        layer.attention.out_lin = assign_lora(layer.attention.out_lin)
    if lora_mlp:
        layer.ffn.lin1 = assign_lora(layer.ffn.lin1)
        layer.ffn.lin2 = assign_lora(layer.ffn.lin2)
if lora_head:
    model.pre_classifier = assign_lora(model.pre_classifier)
    model.classifier = assign_lora(model.classifier)

In [27]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_lin): Linear(in_features=768, out_features=768, bia

In [28]:
# 선형 층이 동결되었는지 확인합니다
for name, param in model.named_parameters():
    print(f"{name}: {param.requires_grad}")

distilbert.embeddings.word_embeddings.weight: False
distilbert.embeddings.position_embeddings.weight: False
distilbert.embeddings.LayerNorm.weight: False
distilbert.embeddings.LayerNorm.bias: False
distilbert.transformer.layer.0.attention.q_lin.linear.weight: False
distilbert.transformer.layer.0.attention.q_lin.linear.bias: False
distilbert.transformer.layer.0.attention.q_lin.lora.W_a: True
distilbert.transformer.layer.0.attention.q_lin.lora.W_b: True
distilbert.transformer.layer.0.attention.k_lin.weight: False
distilbert.transformer.layer.0.attention.k_lin.bias: False
distilbert.transformer.layer.0.attention.v_lin.linear.weight: False
distilbert.transformer.layer.0.attention.v_lin.linear.bias: False
distilbert.transformer.layer.0.attention.v_lin.lora.W_a: True
distilbert.transformer.layer.0.attention.v_lin.lora.W_b: True
distilbert.transformer.layer.0.attention.out_lin.weight: False
distilbert.transformer.layer.0.attention.out_lin.bias: False
distilbert.transformer.layer.0.sa_layer_no

In [29]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print("훈련 가능한 전체 파라미터 개수:", count_parameters(model))

훈련 가능한 전체 파라미터 개수: 147456


# 5 미세 튜닝

**`LightningModule`을 상속하여 훈련합니다**

In [31]:
import lightning as L
import torch
import torchmetrics


class CustomLightningModule(L.LightningModule):
    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["label"])
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]  # this is passed to the optimizer for training

    def validation_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["label"])
        self.log("val_loss", outputs["loss"], prog_bar=True)

        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        self.val_acc(predicted_labels, batch["label"])
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["label"])

        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        self.test_acc(predicted_labels, batch["label"])
        self.log("accuracy", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

lightning_model = CustomLightningModule(model)

In [32]:
callbacks = [
    ModelCheckpoint(
        save_top_k=1, mode="max", monitor="val_acc"
    )  # 최상위 1개 모델을 저장합니다
]
logger = CSVLogger(save_dir="logs/", name="my-model")

In [33]:
trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="gpu",
    precision="16-mixed",
    devices="auto",
    logger=logger,
    log_every_n_steps=10,
)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [34]:
import time
start = time.time()

trainer.fit(model=lightning_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

end = time.time()
elapsed = end - start
print(f"소요 시간: {elapsed/60:.2f} min")

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name     | Type                                | Params | Mode 
-------------------------------------------------------------------------
0 | model    | DistilBertForSequenceClassification | 67.1 M | eval 
1 | val_acc  | MulticlassAccuracy                  | 0      | train
2 | test_acc | MulticlassAccuracy                  | 0      | train
-------------------------------------------------------------------------
147 K     Trainable params
67.0 M    Non-trainable params
67.1 M    Total params
268.410   Total estimated model params size (MB)
26        Modules in train mode
96        Modules in eval mode
INFO:lightning.pytorch.callbacks.model_summary:
  | Name     | Type                                | Params | Mode 
-------------------------------------------------------------------------
0 | model    | DistilBertForSequenceClassification | 67.1 M

Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=3` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


소요 시간: 17.74 min


In [35]:
train_acc = trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best", verbose=False)
val_acc = trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best", verbose=False)
test_acc = trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best", verbose=False)

INFO: Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Testing: |          | 0/? [00:00<?, ?it/s]

INFO: Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

INFO: Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

In [36]:
print(f"훈련 정확도:    {train_acc[0]['accuracy']*100:2.2f}%")
print(f"검증 정확도:    {val_acc[0]['accuracy']*100:2.2f}%")
print(f"테스트 정확도:  {test_acc[0]['accuracy']*100:2.2f}%")

훈련 정확도:    92.86%
검증 정확도:    90.20%
테스트 정확도:  89.29%


In [37]:
import shutil

# 더 이상 필요하지 않은 체크포인트 파일을 삭제합니다
log_dir = f"logs/my-model"
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)