<a href="https://colab.research.google.com/github/rickiepark/MLQandAI/blob/main/supplementary/q18-using-llms/05_lora/lora-mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LoRA -- 다층 퍼셉트론 예제

(LLM이 아니라) 다층 퍼셉트론을 사용해 LoRA([https://arxiv.org/abs/2106.09685](https://arxiv.org/abs/2106.09685))를 밑바닥부터 구현하여 작동 방식을 이해하기 위한 노트북입니다.

In [1]:
import time
import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch


# Ask cuDNN for deterministic algorithms to make GPU runs more reproducible;
# the flag is only touched when a CUDA device is actually available.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

## 설정과 데이터셋

In [2]:
##########################
### Settings
##########################

# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64

##########################
### MNIST dataset
##########################

# transforms.ToTensor() scales the input images to the 0-1 range.
train_dataset = datasets.MNIST(
    root='data', train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
    root='data', train=False, transform=transforms.ToTensor()
)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Sanity check: print the tensor shapes of the first training batch only.
for images, labels in train_loader:
    print('이미지 배치 크기:', images.shape)
    print('레이블 배치 크기:', labels.shape)
    break

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:02<00:00, 4.53MB/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 133kB/s]


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:01<00:00, 1.27MB/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 2.78MB/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw

이미지 배치 크기: torch.Size([64, 1, 28, 28])
레이블 배치 크기: torch.Size([64])





# (LoRA를 사용하지 않는) 다층 퍼셉트론 모델

In [3]:
##########################
### Model
##########################

# Hyperparameters
# random_seed is passed to torch.manual_seed before the baseline model is built;
# learning_rate is handed to the Adam optimizers; num_epochs is the number of
# passes over the training loader per train() call.
random_seed = 123
learning_rate = 0.005
num_epochs = 2

# Architecture
# num_features equals 28*28, the size the batches are flattened to before the
# forward pass; the two hidden widths and the class count size the linear layers.
num_features = 784
num_hidden_1 = 128
num_hidden_2 = 256
num_classes = 10


class MultilayerPerceptron(nn.Module):
    """Fully connected classifier with two ReLU-activated hidden layers.

    Args:
        num_features: size of each (flattened) input vector.
        num_hidden_1: width of the first hidden layer.
        num_hidden_2: width of the second hidden layer.
        num_classes: number of output logits.
    """

    def __init__(self, num_features, num_hidden_1, num_hidden_2, num_classes):
        super().__init__()

        # Build Linear/ReLU pairs from input to output so that the linear
        # layers land at indices 0, 2 and 4 of ``self.layers`` and the weights
        # are drawn from the RNG in the same order as a hand-written stack.
        widths = [num_features, num_hidden_1, num_hidden_2, num_classes]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # the output layer produces raw logits, no activation

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        return self.layers(x)


# Seed right before construction so the initial weights are reproducible.
torch.manual_seed(random_seed)
model_pretrained = MultilayerPerceptron(
    num_features, num_hidden_1, num_hidden_2, num_classes
).to(DEVICE)  # nn.Module.to moves the parameters and returns the same module

# Every parameter of the baseline model is trained.
optimizer_pretrained = torch.optim.Adam(
    params=model_pretrained.parameters(), lr=learning_rate
)

In [4]:
def compute_accuracy(model, data_loader, device):
    """Return the percentage of examples in ``data_loader`` classified correctly.

    The model is switched to evaluation mode (and left in it), gradients are
    disabled, and every batch is flattened to ``(-1, 28*28)`` before the
    forward pass. The prediction for an example is the index of its largest
    logit.

    Args:
        model: module mapping flattened inputs to per-class logits.
        data_loader: iterable yielding ``(features, targets)`` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        A 0-dim float tensor holding ``correct / total * 100``.
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.view(-1, 28*28).to(device)
            labels = labels.to(device)
            # torch.max over dim 1 returns (values, indices); keep the indices.
            predictions = torch.max(model(inputs), 1)[1]
            seen += labels.size(0)
            hits += (predictions == labels).sum()
        accuracy = hits.float() / seen * 100
    return accuracy


def train(num_epochs, model, optimizer, train_loader, device):
    """Train ``model`` with cross-entropy loss and print progress.

    For every epoch the model is put in training mode, each batch is flattened
    to ``(-1, 28*28)``, and one optimizer step is taken per batch. The loss is
    printed every 400 batches; after each epoch the training accuracy (via
    ``compute_accuracy``) and the elapsed wall-clock time are printed.

    Args:
        num_epochs: number of passes over ``train_loader``.
        model: module producing class logits from flattened inputs.
        optimizer: optimizer that updates the model's parameters.
        train_loader: iterable of ``(features, targets)`` batches; must
            support ``len()`` for the progress message.
        device: device the batches are moved to.
    """

    def minutes_elapsed():
        # Wall-clock minutes since training started.
        return (time.time() - t0)/60

    t0 = time.time()
    for epoch_idx in range(num_epochs):
        epoch_no = epoch_idx + 1
        model.train()

        for step, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.view(-1, 28*28).to(device)
            labels = labels.to(device)

            # Forward pass and backpropagation
            loss = F.cross_entropy(model(inputs), labels)
            optimizer.zero_grad()
            loss.backward()

            # Update the model parameters
            optimizer.step()

            # Logging
            if step % 400 == 0:
                print('에포크: %03d/%03d | 배치 %03d/%03d | 손실: %.4f'
                      % (epoch_no, num_epochs, step,
                         len(train_loader), loss))

        with torch.set_grad_enabled(False):
            train_acc = compute_accuracy(model, train_loader, device)
            print('에포크: %03d/%03d 훈련 정확도: %.2f%%' % (
                  epoch_no, num_epochs, train_acc))

        print('소요 시간: %.2f min' % minutes_elapsed())

    print('총 훈련 시간: %.2f min' % minutes_elapsed())

In [5]:
# Train the baseline MLP, then report its accuracy on the test loader.
train(num_epochs, model_pretrained, optimizer_pretrained, train_loader, DEVICE)
print(f'테스트 정확도: {compute_accuracy(model_pretrained, test_loader, DEVICE):.2f}%')

에포크: 001/002 | 배치 000/938 | 손실: 2.2971
에포크: 001/002 | 배치 400/938 | 손실: 0.1529
에포크: 001/002 | 배치 800/938 | 손실: 0.1094
에포크: 001/002 훈련 정확도: 96.01%
소요 시간: 0.22 min
에포크: 002/002 | 배치 000/938 | 손실: 0.1192
에포크: 002/002 | 배치 400/938 | 손실: 0.0593
에포크: 002/002 | 배치 800/938 | 손실: 0.0806
에포크: 002/002 훈련 정확도: 97.23%
소요 시간: 0.43 min
총 훈련 시간: 0.43 min
테스트 정확도: 96.73%


# LoRA를 사용한 다층 퍼셉트론

## LoRA 층을 추가하여 모델 수정하기

In [6]:
class LoRALayer(nn.Module):
    """Low-rank adapter that computes ``alpha * (x @ A @ B)``.

    ``A`` has shape ``(in_dim, rank)`` and starts as Gaussian noise scaled by
    ``1 / sqrt(rank)``; ``B`` has shape ``(rank, out_dim)`` and starts at zero,
    so a freshly constructed layer outputs zeros.

    Args:
        in_dim: size of the last dimension of the input.
        out_dim: size of the last dimension of the output.
        rank: inner dimension of the two factors.
        alpha: scalar multiplier applied to the low-rank product.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.alpha = alpha

        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        a_init = torch.randn(in_dim, rank) * init_scale
        b_init = torch.zeros(rank, out_dim)

        # Register A before B so the parameter order stays A, B.
        self.A = nn.Parameter(a_init)
        self.B = nn.Parameter(b_init)

    def forward(self, x):
        """Project ``x`` down to ``rank`` dims, back up to ``out_dim``, and scale."""
        down_projected = x @ self.A
        return self.alpha * (down_projected @ self.B)


class LinearWithLoRA(nn.Module):
    """Wrap a linear layer and add a LoRA branch to its output.

    Args:
        linear: layer exposing ``in_features`` and ``out_features``; it is
            stored as-is (not copied).
        rank: rank of the LoRA factors.
        alpha: scaling factor forwarded to the LoRA branch.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            in_dim=linear.in_features,
            out_dim=linear.out_features,
            rank=rank,
            alpha=alpha,
        )

    def forward(self, x):
        """Return the wrapped layer's output plus the LoRA branch's output."""
        base_out = self.linear(x)
        adapter_out = self.lora(x)
        return base_out + adapter_out


# Equivalent to LinearWithLoRA, but folds the LoRA update into the weight
# matrix before applying a single linear transform.
class LinearWithLoRAMerged(nn.Module):
    """Linear layer whose weight is merged with a scaled low-rank update.

    Computes ``F.linear(x, W + alpha * (A @ B).T, bias)``, which equals
    ``linear(x) + alpha * (x @ A @ B)``, the output of ``LinearWithLoRA``.

    Args:
        linear: layer exposing ``in_features``, ``out_features``, ``weight``
            and ``bias``; it is stored as-is (not copied).
        rank: rank of the LoRA factors.
        alpha: scaling factor applied to the low-rank update.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        """Apply the merged weight ``W + alpha * (A @ B).T`` to ``x``."""
        # The update must carry the same alpha scaling as the LoRA branch;
        # without it the two formulations diverge as soon as B is non-zero.
        delta = self.lora.alpha * (self.lora.A @ self.lora.B)
        # A @ B is (in_features, out_features) while the linear weight is
        # (out_features, in_features), hence the transpose.
        combined_weight = self.linear.weight + delta.T
        return F.linear(x, combined_weight, self.linear.bias)

In [7]:
# Fix the seed so the toy layer and the input below are reproducible.
torch.manual_seed(123)

# A small stand-alone linear layer and a single random input row.
layer = nn.Linear(10, 2)
x = torch.randn((1, 10))

print("원본 출력:", layer(x))

원본 출력: tensor([[0.6639, 0.4487]], grad_fn=<AddmmBackward0>)


In [8]:
# Wrap the toy layer; LoRALayer initialises B to zeros, so the LoRA branch
# contributes nothing yet.
layer_lora_1 = LinearWithLoRA(layer, rank=2, alpha=4)

print("LoRA 출력:", layer_lora_1(x))

LoRA 출력: tensor([[0.6639, 0.4487]], grad_fn=<AddBackward0>)


In [9]:
# Same toy layer wrapped with the merged-weight variant.
layer_lora_2 = LinearWithLoRAMerged(layer, rank=2, alpha=4)
print("LoRA 출력:", layer_lora_2(x))

LoRA 출력: tensor([[0.6639, 0.4487]], grad_fn=<AddmmBackward0>)


In [10]:
# Display the module tree of the trained baseline model.
model_pretrained

MultilayerPerceptron(
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
)

In [11]:
import copy

# Independent deep copies of the trained baseline, so later modifications do
# not touch model_pretrained.
model_lora = copy.deepcopy(model_pretrained)
# NOTE(review): model_dora is not referenced in the cells visible here;
# presumably used further down or a leftover. Confirm before removing.
model_dora = copy.deepcopy(model_pretrained)

In [12]:
# Replace each linear layer (Sequential indices 0, 2 and 4) with a
# LoRA-augmented wrapper around it.
# NOTE: re-running this cell without re-creating model_lora wraps the
# wrappers a second time.
for layer_idx in (0, 2, 4):
    model_lora.layers[layer_idx] = LinearWithLoRA(
        model_lora.layers[layer_idx], rank=4, alpha=8
    )

# Move the newly created LoRA parameters to the target device as well.
model_lora.to(DEVICE)
optimizer_lora = torch.optim.Adam(model_lora.parameters(), lr=learning_rate)
model_lora

MultilayerPerceptron(
  (layers): Sequential(
    (0): LinearWithLoRA(
      (linear): Linear(in_features=784, out_features=128, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRA(
      (linear): Linear(in_features=128, out_features=256, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRA(
      (linear): Linear(in_features=256, out_features=10, bias=True)
      (lora): LoRALayer()
    )
  )
)

LoRA 층을 추가했지만 아직 훈련하지 않았습니다. 행렬 B가 0으로 초기화되어 있어 LoRA 층의 출력(α·xAB)이 0이므로, LoRA 층이 있는 모델과 그렇지 않은 모델의 예측 성능은 같습니다.

In [13]:
# Compare test accuracy of the baseline and of the not-yet-trained LoRA model.
print(f'원본 모델의 테스트 정확도: {compute_accuracy(model_pretrained, test_loader, DEVICE):.2f}%')
print(f'LoRA 모델의 테스트 정확도: {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

원본 모델의 테스트 정확도: 96.73%
LoRA 모델의 테스트 정확도: 96.73%


## LoRA 모델 훈련

In [14]:
def freeze_linear_layers(model):
    """Disable gradients for every ``nn.Linear`` nested inside ``model``.

    All descendants of ``model`` are visited. When a descendant is an
    ``nn.Linear`` its parameters get ``requires_grad = False`` and the walk
    does not descend further into it; any other module is expanded into its
    children. ``model`` itself is never frozen, only its descendants.
    """
    pending = list(model.children())
    while pending:
        module = pending.pop()
        if isinstance(module, nn.Linear):
            # Sets requires_grad=False on every parameter of this layer.
            module.requires_grad_(False)
        else:
            # Keep looking for linear layers among the submodules.
            pending.extend(module.children())

In [15]:
# Freeze the original linear weights so only the LoRA factors remain trainable.
freeze_linear_layers(model_lora)

# Verify that the linear layers are frozen.
for name, param in model_lora.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [16]:
# Build a fresh optimizer after freezing. All parameters are passed in, but
# the frozen linear weights have requires_grad=False and so get no gradients.
optimizer_lora = torch.optim.Adam(model_lora.parameters(), lr=learning_rate)
train(num_epochs, model_lora, optimizer_lora, train_loader, DEVICE)
print(f'미세 튜닝한 LoRA 모델의 테스트 정확도: {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

에포크: 001/002 | 배치 000/938 | 손실: 0.1210
에포크: 001/002 | 배치 400/938 | 손실: 0.2014
에포크: 001/002 | 배치 800/938 | 손실: 0.1422
에포크: 001/002 훈련 정확도: 97.47%
소요 시간: 0.23 min
에포크: 002/002 | 배치 000/938 | 손실: 0.0776
에포크: 002/002 | 배치 400/938 | 손실: 0.0727
에포크: 002/002 | 배치 800/938 | 손실: 0.0063
에포크: 002/002 훈련 정확도: 97.96%
소요 시간: 0.44 min
총 훈련 시간: 0.44 min
미세 튜닝한 LoRA 모델의 테스트 정확도: 97.04%
