LoRA and DoRA implementations adapted from [Improving LoRA: Implementing Weight-Decomposed Low-Rank Adaptation (DoRA) from Scratch](https://www.kdnuggets.com/2024/01/improving-lora-weight-decomposed-low-rank-adaptation-dora-scratch.html). Copyright 2024 Sebastian Raschka, Apache License 2.0.

Visualization methods adapted from [FastAI v3 Lesson 2: SGD](https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson2-sgd.ipynb). Copyright 2018 FastAI, Apache License 2.0.

# Setup

In [1]:
import copy
from functools import partial
from pathlib import Path
from typing import Optional

from IPython.display import HTML
from matplotlib import animation
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F

In [2]:
mps_available = torch.backends.mps.is_available()
mps_available

True

In [3]:
def set_seeds(seed=42):
    torch.manual_seed(seed)
    np.random.seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seeds()

In [4]:
# Model Configuration
RANK = 4  # LoRA/DoRA rank
ALPHA = 32  # LoRA/DoRA scaling factor
NUM_HIDDEN_1 = 20
NUM_HIDDEN_2 = 20

# Training Configuration
LEARNING_RATE = 0.01
NUM_STEPS = 150

# Animation Configuration
INTERVAL = 20
OUTPUT_DIR = Path('output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Data Configuration
NUM_SAMPLES = 100
NOISE_SCALE = 0.1

In [5]:
torch.__version__

'2.5.1'

In [6]:
def get_device() -> torch.device:
    """Determine the best available device for PyTorch."""
    if torch.cuda.is_available():
        print("Using CUDA device")
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        print("Using MPS device")
        return torch.device("mps")
    else:
        print("Using CPU device")
        return torch.device("cpu")


DEVICE = get_device()
DEVICE

Using MPS device


device(type='mps')

# Generate Data

In [7]:
def generate_data(
    num_samples: int = NUM_SAMPLES,
    noise_scale: float = NOISE_SCALE,
    device: Optional[torch.device] = None,
) -> tuple[Tensor, Tensor, Tensor]:
    """Generate synthetic data for training.

    Args:
        num_samples: Number of data points to generate
        noise_scale: Scale of random noise to add
        device: Device to place tensors on

    Returns:
        x: Input tensor
        y1: First target tensor
        y2: Second target tensor
    """
    if device is None:
        device = get_device()

    try:
        x = torch.linspace(-1, 1, num_samples)[:, None].to(device)
        noise = torch.randn(num_samples)[:, None].to(device)

        y1 = x**2 + noise_scale * noise
        y2 = (
            x**3
            - 0.5 * x**2
            + 0.5
            + noise_scale * torch.randn(num_samples)[:, None].to(device)
        )

        return x, y1, y2
    except RuntimeError as e:
        print(f"Error generating data: {e}")
        raise


# Replace current data generation with:
x, y1, y2 = generate_data(device=DEVICE)

# Train Base Model

In [8]:
class MultilayerPerceptron(nn.Module):
    def __init__(self, num_features, num_hidden_1, num_hidden_2, device=None):
        super().__init__()
        if device is None:
            device = get_device()

        self.layers = nn.Sequential(
            nn.Linear(num_features, num_hidden_1),
            nn.ReLU(),
            nn.Linear(num_hidden_1, num_hidden_2),
            nn.ReLU(),
            nn.Linear(num_hidden_2, 1),
        ).to(device)

    def forward(self, x):
        return self.layers(x)

In [9]:
model = MultilayerPerceptron(1, NUM_HIDDEN_1, NUM_HIDDEN_2).to(DEVICE)
model

Using MPS device


MultilayerPerceptron(
  (layers): Sequential(
    (0): Linear(in_features=1, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=1, bias=True)
  )
)

In [10]:
def create_training_plot(x, y1, y2, model_output):
    """Create a scatter plot of the data points and model prediction line.

    Args:
        x: Input tensor
        y1: First dataset tensor
        y2: Second dataset tensor
        model_output: Model predictions tensor

    Returns:
        fig: matplotlib figure
        line: Line artist for model predictions
    """
    x_cpu = x.cpu()
    y1_cpu = y1.cpu()
    y2_cpu = y2.cpu()
    output_cpu = model_output.cpu().detach()

    fig = plt.figure()
    plt.scatter(x_cpu, y1_cpu, label="Dataset 1")
    plt.scatter(x_cpu, y2_cpu, label="Dataset 2")
    (line,) = plt.plot(x_cpu, output_cpu, "r-", label="Model Prediction")
    plt.legend()
    return fig, line


def animate(model, y, optimizer, line, frame):
    """Animate one frame of training.

    Args:
        model: PyTorch model to train
        y: Target tensor
        optimizer: PyTorch optimizer
        line: Line artist to update
        frame: Current frame number
    """
    # FuncAnimation calls frame=0 twice at start, we want to show initial state both times
    if frame == 0:
        line.set_ydata(model(x).cpu().detach().numpy())
        print(f"Initial Loss: {nn.MSELoss()(model(x), y):.4f}")
        return (line,)

    loss = update(model, y, optimizer)
    if frame % 10 == 0:
        print(f"Iteration {frame}, Loss: {loss:.4f}")

    line.set_ydata(model(x).cpu().detach().numpy())
    return (line,)


def update(model, y, optimizer, loss_fn=nn.MSELoss()):
    """Perform one training step."""
    optimizer.zero_grad()
    try:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()
        return loss.item()
    except RuntimeError as e:
        print(f"Error during training step: {e}")
        raise


def create_training_animation(
    model, x, y, optimizer, NUM_STEPS=NUM_STEPS, interval=INTERVAL
):
    """Create an animation of the training process.

    Args:
        model: PyTorch model to train
        x: Input tensor
        y: Target tensor
        optimizer: PyTorch optimizer
        NUM_STEPS: Number of training steps to animate
        interval: Milliseconds between animation frames
    """
    fig, line = create_training_plot(x, y1, y2, model(x))
    plt.close()  # Prevent display of initial figure

    anim = animation.FuncAnimation(
        fig,
        partial(animate, model, y, optimizer, line),
        frames=NUM_STEPS,
        repeat=False,
        interval=interval,
    )

    return anim

In [11]:
anim = create_training_animation(
    model,
    x,
    y1,
    torch.optim.Adam(model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.1330
Initial Loss: 0.1330
Iteration 10, Loss: 0.0679
Iteration 20, Loss: 0.0158
Iteration 30, Loss: 0.0142
Iteration 40, Loss: 0.0109
Iteration 50, Loss: 0.0093
Iteration 60, Loss: 0.0089
Iteration 70, Loss: 0.0087
Iteration 80, Loss: 0.0086
Iteration 90, Loss: 0.0085
Iteration 100, Loss: 0.0084
Iteration 110, Loss: 0.0084
Iteration 120, Loss: 0.0083
Iteration 130, Loss: 0.0082
Iteration 140, Loss: 0.0082


# Standard Fine-Tuning

In [12]:
finetune_model = copy.deepcopy(model)

In [13]:
anim = create_training_animation(
    finetune_model,
    x,
    y2,
    torch.optim.Adam(finetune_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.0699
Iteration 20, Loss: 0.0476
Iteration 30, Loss: 0.0357
Iteration 40, Loss: 0.0283
Iteration 50, Loss: 0.0235
Iteration 60, Loss: 0.0193
Iteration 70, Loss: 0.0154
Iteration 80, Loss: 0.0122
Iteration 90, Loss: 0.0102
Iteration 100, Loss: 0.0090
Iteration 110, Loss: 0.0085
Iteration 120, Loss: 0.0082
Iteration 130, Loss: 0.0079
Iteration 140, Loss: 0.0077


# LoRA Fine-Tuning

In [14]:
class LoRALayer(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, rank: int, alpha: float) -> None:
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        self.gamma_r = alpha / rank

    def forward(self, x: Tensor) -> Tensor:
        # https://magazine.sebastianraschka.com/p/lora-and-dora-from-scratch
        # uses alpha directly here in place of gamma_r, but using gamma_r
        # makes it simpler to compare LoRA and rsLoRA.
        return self.gamma_r * (x @ self.A @ self.B)

In [15]:
class LinearWithLoRA(nn.Module):
    def __init__(self, linear: nn.Linear, rank: int, alpha: float, lora_layer_class: nn.Module = LoRALayer) -> None:
        super().__init__()
        self.linear = linear
        self.lora = lora_layer_class(linear.in_features, linear.out_features, rank, alpha)

    def forward(self, x: Tensor) -> Tensor:
        return self.linear(x) + self.lora(x)

In [16]:
def test_lora_layer_does_not_change_initial_output():
    layer = nn.Linear(1, 2).to(DEVICE)
    original_output = layer(x[0])
    layer_lora = LinearWithLoRA(layer, rank=1, alpha=4).to(DEVICE)
    lora_output = layer_lora(x[0])
    assert (lora_output == original_output).all()


test_lora_layer_does_not_change_initial_output()

In [17]:
class LinearWithLoRAMerged(LinearWithLoRA):
    def forward(self, x):
        lora = self.lora.A @ self.lora.B
        combined_weight = self.linear.weight + self.lora.gamma_r * lora.T
        return F.linear(x, combined_weight, self.linear.bias)

In [18]:
def test_lora_merged_layer_does_not_change_initial_output():
    layer = nn.Linear(1, 2).to(DEVICE)
    original_output = layer(x[0])
    layer_lora = LinearWithLoRAMerged(layer, rank=1, alpha=4).to(DEVICE)
    lora_output = layer_lora(x[0])
    assert (lora_output == original_output).all()


test_lora_merged_layer_does_not_change_initial_output()

In [19]:
def freeze_linear_layers(model):
    for child in model.children():
        if isinstance(child, nn.Linear):
            for param in child.parameters():
                param.requires_grad = False
        else:
            freeze_linear_layers(child)


In [20]:
def create_lora_model(
    base_model,
    lora_layer_indices,
    lora_layer_class,
):
    """Create a LoRA version of a base model.
    
    Args:
        base_model: Base model to apply LoRA to
        lora_layer_indices: Indices of the layers to apply LoRA to
        lora_layer_class: Class of the LoRA layer to use

    Returns:
        Modified model with LoRA layers
    """
    lora_model = copy.deepcopy(base_model)

    for index in lora_layer_indices:
        lora_model.layers[index] = lora_layer_class(
            lora_model.layers[index]
        ).to(DEVICE)

    freeze_linear_layers(lora_model)

    return lora_model

In [21]:
torch.manual_seed(
    678
)  # resetting seed so LoRA and LoRAMerged get the same LoRA weight initializations
lora_model = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(LinearWithLoRA, rank=RANK, alpha=ALPHA),
)

lora_model

MultilayerPerceptron(
  (layers): Sequential(
    (0): LinearWithLoRA(
      (linear): Linear(in_features=1, out_features=20, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRA(
      (linear): Linear(in_features=20, out_features=20, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRA(
      (linear): Linear(in_features=20, out_features=1, bias=True)
      (lora): LoRALayer()
    )
  )
)

In [22]:
print("Confirming LoRA model linear layers are frozen")
for name, param in lora_model.named_parameters():
    print(f"{name}: {param.requires_grad}")

Confirming LoRA model linear layers are frozen
layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [23]:
torch.manual_seed(
    678
)  # resetting seed so LoRA and LoRAMerged get the same LoRA weight initializations
lora_model_merged = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(LinearWithLoRAMerged, rank=RANK, alpha=ALPHA),
)

lora_model_merged

MultilayerPerceptron(
  (layers): Sequential(
    (0): LinearWithLoRAMerged(
      (linear): Linear(in_features=1, out_features=20, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRAMerged(
      (linear): Linear(in_features=20, out_features=20, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRAMerged(
      (linear): Linear(in_features=20, out_features=1, bias=True)
      (lora): LoRALayer()
    )
  )
)

In [24]:
print("Confirming LoRA Merged model linear layers are frozen")
for name, param in lora_model_merged.named_parameters():
    print(f"{name}: {param.requires_grad}")

Confirming LoRA Merged model linear layers are frozen
layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [25]:
def test_lora_models_produce_same_output():
    with torch.no_grad():
        output1 = lora_model(x)
        output2 = lora_model_merged(x)
        diff = (output1 - output2).abs().max().item()
        assert diff < 1e-5

test_lora_models_produce_same_output()

In [26]:
anim = create_training_animation(
    lora_model,
    x,
    y2,
    torch.optim.Adam(lora_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.0683
Iteration 20, Loss: 0.0441
Iteration 30, Loss: 0.0228
Iteration 40, Loss: 0.0129
Iteration 50, Loss: 0.0095
Iteration 60, Loss: 0.0080
Iteration 70, Loss: 0.0076
Iteration 80, Loss: 0.0074
Iteration 90, Loss: 0.0073
Iteration 100, Loss: 0.0073
Iteration 110, Loss: 0.0072
Iteration 120, Loss: 0.0072
Iteration 130, Loss: 0.0071
Iteration 140, Loss: 0.0071


In [27]:
anim = create_training_animation(
    lora_model_merged,
    x,
    y2,
    torch.optim.Adam(lora_model_merged.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.0683
Iteration 20, Loss: 0.0441
Iteration 30, Loss: 0.0228
Iteration 40, Loss: 0.0129
Iteration 50, Loss: 0.0095
Iteration 60, Loss: 0.0080
Iteration 70, Loss: 0.0076
Iteration 80, Loss: 0.0074
Iteration 90, Loss: 0.0073
Iteration 100, Loss: 0.0073
Iteration 110, Loss: 0.0072
Iteration 120, Loss: 0.0072
Iteration 130, Loss: 0.0071
Iteration 140, Loss: 0.0071


# DoRA Fine-Tuning

In [28]:
class LinearWithDoRAMerged(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        self.m = nn.Parameter(self.linear.weight.norm(p=2, dim=0, keepdim=True))

    def forward(self, x):
        lora = self.lora.A @ self.lora.B
        numerator = self.linear.weight + self.lora.gamma_r * lora.T
        denominator = numerator.norm(p=2, dim=0, keepdim=True)
        directional_component = numerator / denominator
        new_weight = self.m * directional_component
        return F.linear(x, new_weight, self.linear.bias)

In [29]:
dora_model = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(LinearWithDoRAMerged, rank=RANK, alpha=ALPHA),
)

dora_model

MultilayerPerceptron(
  (layers): Sequential(
    (0): LinearWithDoRAMerged(
      (linear): Linear(in_features=1, out_features=20, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithDoRAMerged(
      (linear): Linear(in_features=20, out_features=20, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithDoRAMerged(
      (linear): Linear(in_features=20, out_features=1, bias=True)
      (lora): LoRALayer()
    )
  )
)

In [30]:
for name, param in dora_model.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.m: True
layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.m: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.m: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [31]:
anim = create_training_animation(
    dora_model,
    x,
    y2,
    torch.optim.Adam(dora_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.0548
Iteration 20, Loss: 0.0379
Iteration 30, Loss: 0.0654
Iteration 40, Loss: 0.0348
Iteration 50, Loss: 0.0225
Iteration 60, Loss: 0.0139
Iteration 70, Loss: 0.0097
Iteration 80, Loss: 0.0087
Iteration 90, Loss: 0.0083
Iteration 100, Loss: 0.0078
Iteration 110, Loss: 0.0075
Iteration 120, Loss: 0.0074
Iteration 130, Loss: 0.0072
Iteration 140, Loss: 0.0071


# rsLoRA Fine-Tuning

IIUC rsLoRA is just LoRA with a different scaling factor. The [paper](https://arxiv.org/2312.03732) that introduced it shows that the scale factor $\gamma_r$ applied to the LoRA matrix $BA$ in $x_{out} = (W + \gamma_r BA)x_{in}$ needs to scale as $\frac{1}{\sqrt{r}}$ rather than $\frac{1}{r}$, or it will be unstable for sufficiently large $r$.

In [32]:
class RsLoRALayer(LoRALayer):
    def __init__(self, in_dim: int, out_dim: int, rank: int, alpha: float) -> None:
        super().__init__(in_dim, out_dim, rank, alpha)
        self.gamma_r = alpha / (rank ** 1/2)

## Reproduce LoRA results

In [33]:
torch.manual_seed(
    678
)  # resetting seed so we get the same LoRA weight initializations as before
rslora_model = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(
        LinearWithLoRA,
        rank=RANK,
        # will give same gamma_r as LoRA above, so should train the same
        alpha=ALPHA / (RANK ** 1/2),
        lora_layer_class=RsLoRALayer,
    ),
)
rslora_model

MultilayerPerceptron(
  (layers): Sequential(
    (0): LinearWithLoRA(
      (linear): Linear(in_features=1, out_features=20, bias=True)
      (lora): RsLoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRA(
      (linear): Linear(in_features=20, out_features=20, bias=True)
      (lora): RsLoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRA(
      (linear): Linear(in_features=20, out_features=1, bias=True)
      (lora): RsLoRALayer()
    )
  )
)

In [34]:
for name, param in rslora_model.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [35]:
anim = create_training_animation(
    rslora_model,
    x,
    y2,
    torch.optim.Adam(rslora_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.0683
Iteration 20, Loss: 0.0441
Iteration 30, Loss: 0.0228
Iteration 40, Loss: 0.0129
Iteration 50, Loss: 0.0095
Iteration 60, Loss: 0.0080
Iteration 70, Loss: 0.0076
Iteration 80, Loss: 0.0074
Iteration 90, Loss: 0.0073
Iteration 100, Loss: 0.0073
Iteration 110, Loss: 0.0072
Iteration 120, Loss: 0.0072
Iteration 130, Loss: 0.0071
Iteration 140, Loss: 0.0071


## Compare LoRA and rsLoRA at Extreme Ranks

I don't know that we'll see better stability across ranks for rsLoRA given our simple base model, but let's try it out.

In [36]:
lora_model = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(LinearWithLoRAMerged, rank=1, alpha=ALPHA),
)
anim = create_training_animation(
    lora_model,
    x,
    y2,
    torch.optim.Adam(lora_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.1630
Iteration 20, Loss: 0.0863
Iteration 30, Loss: 0.0582
Iteration 40, Loss: 0.0450
Iteration 50, Loss: 0.0378
Iteration 60, Loss: 0.0342
Iteration 70, Loss: 0.0333
Iteration 80, Loss: 0.0321
Iteration 90, Loss: 0.0312
Iteration 100, Loss: 0.0284
Iteration 110, Loss: 0.0272
Iteration 120, Loss: 0.0258
Iteration 130, Loss: 0.0248
Iteration 140, Loss: 0.0244


In [37]:
lora_model = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(LinearWithLoRAMerged, rank=20, alpha=ALPHA),
)
anim = create_training_animation(
    lora_model,
    x,
    y2,
    torch.optim.Adam(lora_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.0492
Iteration 20, Loss: 0.0403
Iteration 30, Loss: 0.0235
Iteration 40, Loss: 0.0140
Iteration 50, Loss: 0.0091
Iteration 60, Loss: 0.0089
Iteration 70, Loss: 0.0104
Iteration 80, Loss: 0.0072
Iteration 90, Loss: 0.0075
Iteration 100, Loss: 0.0070
Iteration 110, Loss: 0.0070
Iteration 120, Loss: 0.0070
Iteration 130, Loss: 0.0081
Iteration 140, Loss: 0.0074


In [38]:
rslora_model = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(
        LinearWithLoRA,
        rank=1,
        alpha=ALPHA,
        lora_layer_class=RsLoRALayer,
    ),
)
anim = create_training_animation(
    rslora_model,
    x,
    y2,
    torch.optim.Adam(rslora_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.1289
Iteration 20, Loss: 0.1361
Iteration 30, Loss: 0.0921
Iteration 40, Loss: 0.0669
Iteration 50, Loss: 0.0570
Iteration 60, Loss: 0.0521
Iteration 70, Loss: 0.0505
Iteration 80, Loss: 0.0490
Iteration 90, Loss: 0.0478
Iteration 100, Loss: 0.0455
Iteration 110, Loss: 0.0435
Iteration 120, Loss: 0.0418
Iteration 130, Loss: 0.0403
Iteration 140, Loss: 0.0394


In [39]:
rslora_model = create_lora_model(
    model,
    lora_layer_indices=[0, 2, 4],
    lora_layer_class=partial(
        LinearWithLoRA,
        rank=20,
        alpha=ALPHA,
        lora_layer_class=RsLoRALayer,
    ),
)
anim = create_training_animation(
    rslora_model,
    x,
    y2,
    torch.optim.Adam(rslora_model.parameters(), lr=LEARNING_RATE),
)
HTML(anim.to_html5_video())

Initial Loss: 0.3533
Initial Loss: 0.3533
Iteration 10, Loss: 0.0628
Iteration 20, Loss: 0.0409
Iteration 30, Loss: 0.0331
Iteration 40, Loss: 0.0261
Iteration 50, Loss: 0.0181
Iteration 60, Loss: 0.0149
Iteration 70, Loss: 0.0122
Iteration 80, Loss: 0.0119
Iteration 90, Loss: 0.0108
Iteration 100, Loss: 0.0098
Iteration 110, Loss: 0.0091
Iteration 120, Loss: 0.0087
Iteration 130, Loss: 0.0082
Iteration 140, Loss: 0.0077
