In [37]:
import torch
import torch.nn as nn
import math

# Simple model with different parameter types
class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Matrix parameters (2D)
        self.weight1 = nn.Parameter(torch.randn(4, 8))
        self.weight2 = nn.Parameter(torch.randn(8, 2))
        # Vector parameters (1D)
        self.bias1 = nn.Parameter(torch.randn(8))
        self.bias2 = nn.Parameter(torch.randn(2))
        
    def forward(self, x):
        x = x @ self.weight1 + self.bias1
        x = x @ self.weight2 + self.bias2
        return x

def loss_fn(x, y): # MSE loss 
    return (x - y).norm()

# Parameter group example
model = ToyModel()

input = torch.randn(4)
output = model(input)
loss = loss_fn(output, torch.randn(2))

# Ok, so i can choose my own parameter groups when initializing the optimizer
# - but once specified, parameter groups are fixed within the optimizer

# Method 1: Using a single parameter group
opt1 = torch.optim.SGD(model.parameters(), lr=0.1)
print(f"Optimizer 1 has {len(opt1.param_groups)} parameter group(s)")

# Method 2: Using multiple parameter groups with different learning rates
opt2 = torch.optim.SGD([
    {'params': [model.weight1, model.weight2], 'lr': 0.1},
    {'params': [model.bias1, model.bias2], 'lr': 0.01}
])
print(f"Optimizer 2 has {len(opt2.param_groups)} parameter group(s)")

# Print learning rates for each group
for i, group in enumerate(opt2.param_groups):
    print(f"Group {i} has lr={group['lr']} and contains {len(group['params'])} parameters")


Optimizer 1 has 1 parameter group(s)
Optimizer 2 has 2 parameter group(s)
Group 0 has lr=0.1 and contains 2 parameters
Group 1 has lr=0.01 and contains 2 parameters


In [48]:
# loss.backward()
# regarding

def decide_rank(loss, max_loss=None, max_rank: int = 8, min_rank: int = 1):
    # Use 'maximal loss' to decide rank value
    if not max_loss: 
        max_loss = loss.item() 
    # ratio to decide rank value
    return max(min_rank, int(loss.item() / max_loss * max_rank))

In [66]:
# gradient 
for p in model.parameters(): 
    if p.ndim == 2: 
       # max_loss = self.state[p]['max_loss']
       break 

p # tensor value 
p.grad # gradient value
orig_grad = p.grad  
r = decide_rank(loss, None, max_rank=min(p.shape))

# SVD (2 rotation matrices, one vector of singular values)
U, S, V = torch.linalg.svd(orig_grad, full_matrices=True)

# Slicing important components from SVD results
U[:,:r], S[:r], V[:r,:]

# then accumulate 1st & 2nd moment of sliced values above
# note - the 2nd moment should be 'row-wise norm' for V and 'column-wise norm' for U 

# are we missing the rotation matrix decomposition with magnitude vector, 
# - or do we have it already with SVD (U, V matrices) ?

In [60]:
U.shape, V.shape, orig_grad.shape

(torch.Size([4, 4]), torch.Size([8, 8]), torch.Size([4, 8]))

In [67]:
S

tensor([1.3848e+01, 6.6384e-07, 1.2254e-07, 8.4567e-09])

In [None]:
# Traditional Optimizer assumes 'gradient' and 'parameter' are fixed
# - Our idea combines 'wrapping low-rank adaptor' and call .backward() with optimization gadegts together 
# - in terms of code this is not just a custom optimizer, there needs to be another functionality happening before the .backward() functional .... 
