In [35]:
import torch
import torch.nn as nn
import torch.functional as F

In [36]:
x = torch.rand((1000,20))

y = (torch.sin(x.sum(1))>0).long()

unique, counts = torch.unique(y, return_counts=True)
distribution = dict(zip(unique.tolist(), counts.tolist()))

n_train = 800
batch_size = 64

In [37]:
#dataloader
train_dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(x[:n_train],y[:n_train]),batch_size=batch_size,shuffle=True,)

eval_dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(x[n_train:],y[n_train:]),batch_size=batch_size)

In [38]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(20,2000),
            nn.ReLU(),
            nn.Linear(2000,200),
            nn.ReLU(),
            nn.Linear(200,2),
            nn.LogSoftmax(dim=-1),
        )
    def forward(self,x):
        return self.seq(x)
    
lr = 0.002
batch_size = 64
max_epochs = 35
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [39]:
device

'cpu'

In [40]:
def train(model, optimizer, criterion, train_dataloader, eval_dataloader, epochs):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for xb, yb in train_dataloader:
            xb = xb.to(device)
            yb = yb.to(device)
            outputs = model(xb)
            loss = criterion(outputs, yb)
            train_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        eval_loss = 0
        for xb, yb in eval_dataloader:
            xb = xb.to(device)
            yb = yb.to(device)
            with torch.no_grad():
                outputs = model(xb)
            loss = criterion(outputs, yb)
            eval_loss += loss.detach().float()

        eval_loss_total = (eval_loss / len(eval_dataloader)).item()
        train_loss_total = (train_loss / len(train_dataloader)).item()
        print(f"{epoch=:<2}  {train_loss_total=:.4f}  {eval_loss_total=:.4f}")

In [41]:
base_model = MLP().to(device)
optimizer = torch.optim.Adam(base_model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()


In [42]:
base_model

MLP(
  (seq): Sequential(
    (0): Linear(in_features=20, out_features=2000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2000, out_features=200, bias=True)
    (3): ReLU()
    (4): Linear(in_features=200, out_features=2, bias=True)
    (5): LogSoftmax(dim=-1)
  )
)

In [43]:
#train params
def trainable_parameters(model):
    trainable_params = 0
    all_params = 0
    for _,params in model.named_parameters():
        all_params+=params.numel()
        if params.requires_grad:
            trainable_params+=params.numel()
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params}")

In [44]:
trainable_parameters(base_model)

trainable params: 442602 || all params: 442602 || trainable%: 100.0


In [45]:
train(base_model, optimizer, criterion, train_dataloader, eval_dataloader, epochs=20)

epoch=0   train_loss_total=0.6533  eval_loss_total=0.6142
epoch=1   train_loss_total=0.5644  eval_loss_total=0.5480
epoch=2   train_loss_total=0.4719  eval_loss_total=0.4257
epoch=3   train_loss_total=0.3922  eval_loss_total=0.4282
epoch=4   train_loss_total=0.3832  eval_loss_total=0.3424
epoch=5   train_loss_total=0.3882  eval_loss_total=0.4349
epoch=6   train_loss_total=0.3559  eval_loss_total=0.3626
epoch=7   train_loss_total=0.3831  eval_loss_total=0.3833
epoch=8   train_loss_total=0.3440  eval_loss_total=0.3817
epoch=9   train_loss_total=0.2995  eval_loss_total=0.3541
epoch=10  train_loss_total=0.2626  eval_loss_total=0.3377
epoch=11  train_loss_total=0.2584  eval_loss_total=0.3602
epoch=12  train_loss_total=0.2270  eval_loss_total=0.3462
epoch=13  train_loss_total=0.2291  eval_loss_total=0.3808
epoch=14  train_loss_total=0.2298  eval_loss_total=0.3745
epoch=15  train_loss_total=0.2051  eval_loss_total=0.3491
epoch=16  train_loss_total=0.1968  eval_loss_total=0.5074
epoch=17  trai

In [46]:
print(torch.cuda.is_available())  # Should return True if CUDA is available

False


In [47]:
!python -m pip install --upgrade peft

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [48]:
import copy
import os

os.environ["BITSANDBYTES_NOWELCOME"] = "1"

In [49]:
import peft

In [50]:
# Let's identify the names of the modules, ensuring that we fine-tune the appropriate ones with adaptors.
[(n, type(m)) for n, m in base_model.named_modules()]

[('', __main__.MLP),
 ('seq', torch.nn.modules.container.Sequential),
 ('seq.0', torch.nn.modules.linear.Linear),
 ('seq.1', torch.nn.modules.activation.ReLU),
 ('seq.2', torch.nn.modules.linear.Linear),
 ('seq.3', torch.nn.modules.activation.ReLU),
 ('seq.4', torch.nn.modules.linear.Linear),
 ('seq.5', torch.nn.modules.activation.LogSoftmax)]

In [51]:
#shape
[(n, type(m), [p.shape for p in m.parameters()]) for n, m in base_model.named_modules()]

[('',
  __main__.MLP,
  [torch.Size([2000, 20]),
   torch.Size([2000]),
   torch.Size([200, 2000]),
   torch.Size([200]),
   torch.Size([2, 200]),
   torch.Size([2])]),
 ('seq',
  torch.nn.modules.container.Sequential,
  [torch.Size([2000, 20]),
   torch.Size([2000]),
   torch.Size([200, 2000]),
   torch.Size([200]),
   torch.Size([2, 200]),
   torch.Size([2])]),
 ('seq.0',
  torch.nn.modules.linear.Linear,
  [torch.Size([2000, 20]), torch.Size([2000])]),
 ('seq.1', torch.nn.modules.activation.ReLU, []),
 ('seq.2',
  torch.nn.modules.linear.Linear,
  [torch.Size([200, 2000]), torch.Size([200])]),
 ('seq.3', torch.nn.modules.activation.ReLU, []),
 ('seq.4',
  torch.nn.modules.linear.Linear,
  [torch.Size([2, 200]), torch.Size([2])]),
 ('seq.5', torch.nn.modules.activation.LogSoftmax, [])]

In [52]:
config = peft.LoraConfig(r=3,target_modules=["seq.0","seq.2"],)

In [53]:
base_model_pretrained = copy.deepcopy(base_model) #copy for pretraining
peft_model = peft.get_peft_model(base_model,config)
optim_model = torch.optim.Adam(peft_model.parameters(),lr=lr)
criterion = nn.CrossEntropyLoss()
peft_model.print_trainable_parameters()

trainable params: 12,660 || all params: 455,262 || trainable%: 2.7808


In [54]:
peft_model

PeftModel(
  (base_model): LoraModel(
    (model): MLP(
      (seq): Sequential(
        (0): lora.Linear(
          (base_layer): Linear(in_features=20, out_features=2000, bias=True)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=20, out_features=3, bias=False)
          )
          (lora_B): ModuleDict(
            (default): Linear(in_features=3, out_features=2000, bias=False)
          )
          (lora_embedding_A): ParameterDict()
          (lora_embedding_B): ParameterDict()
          (lora_magnitude_vector): ModuleDict()
        )
        (1): ReLU()
        (2): lora.Linear(
          (base_layer): Linear(in_features=2000, out_features=200, bias=True)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=2000, out_features=3, bias=False)
          )
          (lora_

In [55]:
[(n, type(m), [p.shape for p in m.parameters()]) for n, m in peft_model.named_modules()]

[('',
  peft.peft_model.PeftModel,
  [torch.Size([2000, 20]),
   torch.Size([2000]),
   torch.Size([3, 20]),
   torch.Size([2000, 3]),
   torch.Size([200, 2000]),
   torch.Size([200]),
   torch.Size([3, 2000]),
   torch.Size([200, 3]),
   torch.Size([2, 200]),
   torch.Size([2])]),
 ('base_model',
  peft.tuners.lora.model.LoraModel,
  [torch.Size([2000, 20]),
   torch.Size([2000]),
   torch.Size([3, 20]),
   torch.Size([2000, 3]),
   torch.Size([200, 2000]),
   torch.Size([200]),
   torch.Size([3, 2000]),
   torch.Size([200, 3]),
   torch.Size([2, 200]),
   torch.Size([2])]),
 ('base_model.model',
  __main__.MLP,
  [torch.Size([2000, 20]),
   torch.Size([2000]),
   torch.Size([3, 20]),
   torch.Size([2000, 3]),
   torch.Size([200, 2000]),
   torch.Size([200]),
   torch.Size([3, 2000]),
   torch.Size([200, 3]),
   torch.Size([2, 200]),
   torch.Size([2])]),
 ('base_model.model.seq',
  torch.nn.modules.container.Sequential,
  [torch.Size([2000, 20]),
   torch.Size([2000]),
   torch.Size(

In [56]:
lora_B = peft_model.state_dict()['base_model.model.seq.0.lora_B.default.weight']
lora_A = peft_model.state_dict()['base_model.model.seq.0.lora_A.default.weight']

In [57]:
print(lora_A.size())
print(lora_B.size())

torch.Size([3, 20])
torch.Size([2000, 3])


In [58]:
#weight of adaptor
#hpreact = X(input)* alpha + bias
train(peft_model, optimizer, criterion, train_dataloader, eval_dataloader, epochs=5)

epoch=0   train_loss_total=0.1184  eval_loss_total=0.3984
epoch=1   train_loss_total=0.1167  eval_loss_total=0.3984
epoch=2   train_loss_total=0.1209  eval_loss_total=0.3984
epoch=3   train_loss_total=0.1165  eval_loss_total=0.3984
epoch=4   train_loss_total=0.1185  eval_loss_total=0.3984


In [59]:
#finetuned dataset
x = torch.rand((500,20))

y = (torch.sin(x.sum(1))>10).long()

n_train = 300
batch_size = 64

In [60]:
import torch.utils.data.dataloader


train_dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(x[:n_train],y[:n_train]),batch_size=batch_size,shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(x[n_train:],y[n_train:]),batch_size=batch_size)

%time train(peft_model, optimizer, criterion, train_dataloader, eval_dataloader, epochs=10)

epoch=0   train_loss_total=0.7243  eval_loss_total=0.9177
epoch=1   train_loss_total=0.7464  eval_loss_total=0.9177
epoch=2   train_loss_total=0.7164  eval_loss_total=0.9177
epoch=3   train_loss_total=0.7230  eval_loss_total=0.9177
epoch=4   train_loss_total=0.7201  eval_loss_total=0.9177
epoch=5   train_loss_total=0.7225  eval_loss_total=0.9177
epoch=6   train_loss_total=0.7477  eval_loss_total=0.9177
epoch=7   train_loss_total=0.7518  eval_loss_total=0.9177
epoch=8   train_loss_total=0.7400  eval_loss_total=0.9177
epoch=9   train_loss_total=0.7254  eval_loss_total=0.9177
CPU times: total: 922 ms
Wall time: 258 ms


In [61]:
device

'cpu'

In [62]:
print("** Pretrained based model's parameters **")
for name, param in base_model_pretrained.named_parameters():
  print(name)
print()
print("** Peft model's parameters **")
for name, param in peft_model.named_parameters():
  print(name)

** Pretrained based model's parameters **
seq.0.weight
seq.0.bias
seq.2.weight
seq.2.bias
seq.4.weight
seq.4.bias

** Peft model's parameters **
base_model.model.seq.0.base_layer.weight
base_model.model.seq.0.base_layer.bias
base_model.model.seq.0.lora_A.default.weight
base_model.model.seq.0.lora_B.default.weight
base_model.model.seq.2.base_layer.weight
base_model.model.seq.2.base_layer.bias
base_model.model.seq.2.lora_A.default.weight
base_model.model.seq.2.lora_B.default.weight
base_model.model.seq.4.weight
base_model.model.seq.4.bias


In [63]:
print(torch.equal(base_model_pretrained.state_dict()['seq.0.weight'], peft_model.state_dict()['base_model.model.seq.0.base_layer.weight']))
print(torch.equal(base_model_pretrained.state_dict()['seq.0.bias'], peft_model.state_dict()['base_model.model.seq.0.base_layer.bias']))
print(torch.equal(base_model_pretrained.state_dict()['seq.2.weight'], peft_model.state_dict()['base_model.model.seq.2.base_layer.weight']))
print(torch.equal(base_model_pretrained.state_dict()['seq.2.bias'], peft_model.state_dict()['base_model.model.seq.2.base_layer.bias']))
print(torch.equal(base_model_pretrained.state_dict()['seq.4.weight'], peft_model.state_dict()['base_model.model.seq.4.weight']))
print(torch.equal(base_model_pretrained.state_dict()['seq.4.bias'], peft_model.state_dict()['base_model.model.seq.4.bias']))

True
True
True
True
True
True


In [65]:
trainable_parameters(base_model_pretrained)
trainable_parameters(peft_model)

trainable params: 442602 || all params: 442602 || trainable%: 100.0
trainable params: 12660 || all params: 455262 || trainable%: 2.780816321151337


In [66]:
config_1 = peft.LoraConfig(
    r=3,
    target_modules=["seq.0", "seq.2"],
    modules_to_save=["seq.4"],
)

In [67]:
copy_1 = copy.deepcopy(base_model_pretrained) # keep the orginal as is and work on a copy
peft_model_1 = peft.get_peft_model(copy_1, config_1)
optimizer = torch.optim.Adam(peft_model_1.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
peft_model_1.print_trainable_parameters()

trainable params: 13,062 || all params: 455,664 || trainable%: 2.8666


In [68]:
peft_model_1

PeftModel(
  (base_model): LoraModel(
    (model): MLP(
      (seq): Sequential(
        (0): lora.Linear(
          (base_layer): Linear(in_features=20, out_features=2000, bias=True)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=20, out_features=3, bias=False)
          )
          (lora_B): ModuleDict(
            (default): Linear(in_features=3, out_features=2000, bias=False)
          )
          (lora_embedding_A): ParameterDict()
          (lora_embedding_B): ParameterDict()
          (lora_magnitude_vector): ModuleDict()
        )
        (1): ReLU()
        (2): lora.Linear(
          (base_layer): Linear(in_features=2000, out_features=200, bias=True)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict(
            (default): Linear(in_features=2000, out_features=3, bias=False)
          )
          (lora_

In [69]:
peft_model_unmerged = copy.deepcopy(peft_model)
peft_model_merged_and_unloaded = peft_model.merge_and_unload()

In [71]:
trainable_parameters(peft_model_merged_and_unloaded)

trainable params: 0 || all params: 442602 || trainable%: 0.0


In [72]:
for name, param in peft_model.base_model.named_parameters():
    if "lora" not in name:
        print(f"New parameter {name:<35} | {param.numel():>15} parameters | not updated")
        continue

    print(f"New parameter {name:<35} | {param.numel():>15} parameters | updated")

New parameter model.seq.0.weight                  |           40000 parameters | not updated
New parameter model.seq.0.bias                    |            2000 parameters | not updated
New parameter model.seq.2.weight                  |          400000 parameters | not updated
New parameter model.seq.2.bias                    |             200 parameters | not updated
New parameter model.seq.4.weight                  |             400 parameters | not updated
New parameter model.seq.4.bias                    |               2 parameters | not updated
