In [28]:
import torch
from arc.model import GPTModel
from peft.lora import replace_linear_with_lora

In [29]:
def get_model_trainable_params(model: torch.nn.Module):
    """Return the number of trainable scalar values held by ``model``.

    A parameter tensor contributes its full element count (``numel()``) when
    its ``requires_grad`` flag is set; tensors with the flag cleared are
    skipped.

    Args:
        model: Module whose ``parameters()`` are inspected.

    Returns:
        Sum of element counts over all parameters that require gradients
        (``0`` when none do).
    """
    trainable_count = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_count += tensor.numel()
    return trainable_count

In [30]:
# Hyper-parameters handed to GPTModel(cfg=...).
# NOTE(review): despite the "124M" name, the model built from this config in
# this notebook reports 163,009,536 parameters -- presumably the output
# projection is not weight-tied with the token embedding; confirm in arc.model.
GPT_CONFIG_124M = {
    # Embedding table sizes
    "vocab_size": 50257,     # rows of the token-embedding table
    "context_length": 1024,  # rows of the positional-embedding table (max sequence length)
    # Transformer dimensions
    "emb_dim": 768,          # width of each embedding vector
    "n_heads": 12,           # attention heads
    "n_layers": 12,          # number of layers
    # Regularisation / projection options
    "drop_rate": 0.1,        # dropout probability
    "qkv_bias": False,       # bias term on the query/key/value projections
}

In [31]:
# Instantiate the GPT model from the GPT_CONFIG_124M dictionary.
model = GPTModel(cfg=GPT_CONFIG_124M)
# Bare expression as the last line of the cell: the notebook displays the
# module's repr (the layer tree).
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_feature

In [32]:
# Baseline: trainable parameter count of the freshly built model. It is kept
# in `all_train_params` because a later cell uses it as the denominator of the
# LoRA percentage.
all_train_params = get_model_trainable_params(model)
print(
    f"Total trainable parameters before: {all_train_params:,}"
)


Total trainable parameters before: 163,009,536


In [33]:
# Freeze all of the model's parameters: clearing `requires_grad` on each
# tensor removes it from the trainable count reported by
# get_model_trainable_params.
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

In [34]:
# Sanity check: with every parameter frozen, the trainable count should be 0.
print(
    f"Total trainable parameters after freezing: {get_model_trainable_params(model=model):,}"
)

Total trainable parameters after freezing: 0


In [35]:
# Add LoRA adapters to the frozen model. The return value is discarded, so the
# helper presumably modifies `model` in place -- the repr displayed afterwards
# shows Linear layers wrapped in LinearWithLoRA(linear=..., lora=LoRALayer()).
# NOTE(review): `rank` / `alpha` are assumed to be the LoRA rank and scaling
# factor; confirm against peft.lora (which looks like a project-local module
# rather than the Hugging Face `peft` package -- verify).
replace_linear_with_lora(model, rank=32, alpha=32)

In [36]:
# Display the module tree again; the Linear layers shown in the output are now
# LinearWithLoRA wrappers that keep the original layer as `linear` and add a
# `lora` LoRALayer next to it.
model  # architecture of the model after applying LoRA

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=False)
          (lora): LoRALayer()
        )
        (W_key): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=False)
          (lora): LoRALayer()
        )
        (W_value): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=False)
          (lora): LoRALayer()
        )
        (out_proj): LinearWithLoRA(
          (linear): Linear(in_features=768, out_features=768, bias=True)
          (lora): LoRALayer()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): LinearWithLoRA(
            (linear): Linear(in_features=768, out_fe

In [37]:
# Count what is trainable after LoRA was applied to the frozen model and
# report it as a share of the baseline count stored in `all_train_params`.
trainable_lora_params = get_model_trainable_params(model=model)
# The percentage uses `.2f` (two fixed decimals). A bare `.3` precision means
# three *significant digits*, so 100.0 would print as "1e+02" and 12.345 as
# "12.3"; `.2f` prints the same "4.26" for the current values.
print(f"Total trainable parameters after applyting LoRA: {trainable_lora_params:,} | {trainable_lora_params * 100 / all_train_params:.2f}% of total {all_train_params:,}")

Total trainable parameters after applyting LoRA: 6,941,216 | 4.26% of total 163,009,536
