In [None]:
!pip install rouge
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
# install additional dependencies needed for training
!pip install rouge-score tensorboard py7zr
!pip install datasets

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K  

In [None]:
!pip install einops

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
torch.set_default_device('cuda')


In [None]:
# Load the base checkpoint together with its own tokenizer, then run a short
# generation as a smoke test.
torch.set_default_device('cuda')
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally; keep it only for checkpoints that actually need it.
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", trust_remote_code=True, torch_dtype="auto")
# torch_dtype only applies to model weights, so it is not passed to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", trust_remote_code=True)
# Keep the attention mask in the encoding so generate() receives it.
inputs = tokenizer('''```python
def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt")

# Pass pad_token_id explicitly; otherwise generate() falls back to eos_token_id
# and logs a warning.
outputs = model.generate(**inputs, max_length=200, pad_token_id=tokenizer.eos_token_id)
text = tokenizer.batch_decode(outputs)[0]
print(text)

## Gemma: the code runs, but yields 0 compression

In [None]:
# torch.set_default_device('cuda')
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", trust_remote_code=True, torch_dtype="auto")
# tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", trust_remote_code=True, torch_dtype="auto")
# inputs = tokenizer('''```python
# def print_prime(n):
#    """
#    Print all primes between 1 and n
#    """''', return_tensors="pt", return_attention_mask=False)

# outputs = model.generate(**inputs, max_length=200)
# text = tokenizer.batch_decode(outputs)[0]
# print(text)

In [None]:
# torch.set_default_device('cuda')
# model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-2-7b-chat-hf", trust_remote_code=True, torch_dtype="auto")
# tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf", trust_remote_code=True, torch_dtype="auto")
# inputs = tokenizer('''```python
# def print_prime(n):
#    """
#    Print all primes between 1 and n
#    """''', return_tensors="pt", return_attention_mask=False)

# outputs = model.generate(**inputs, max_length=200)
# text = tokenizer.batch_decode(outputs)[0]
# print(text)

In [None]:
# torch.set_default_device('cuda')
# model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")
# tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, torch_dtype="auto")
# inputs = tokenizer('''```python
# def print_prime(n):
#    """
#    Print all primes between 1 and n
#    """''', return_tensors="pt", return_attention_mask=False)

# outputs = model.generate(**inputs, max_length=200)
# text = tokenizer.batch_decode(outputs)[0]
# print(text)

In [None]:
from dataclasses import dataclass

@dataclass
class LowRankConfig:
    """Settings for the low-rank compression pass.

    Attributes:
        rank: Number of singular components kept for each decomposed layer.
        target_modules: Module-name suffixes selecting the layers to decompose.
    """

    # number of singular values/vectors that survive truncation
    rank: int
    # name suffixes of the layers that get replaced
    target_modules: list[str]

In [None]:
model

In [None]:
# Low-rank decomposition of the self-attention key, query and value matrices,
# keeping 384 singular components per layer.
# NOTE(review): targets are matched as suffixes of module names, so confirm the
# loaded checkpoint really has modules ending in ".Wqkv" (inspect the `model`
# repr); if none match, no layer is replaced.
config = LowRankConfig(rank=384, target_modules=["Wqkv"])

In [None]:
from torch import nn
from dataclasses import dataclass
from torch.nn import functional as F
class LowRankLayer(nn.Module):
    """Truncated-SVD replacement for a linear layer.

    The wrapped layer's weight ``W`` is factorised as ``U @ diag(S) @ Vh`` and
    only the leading ``rank`` singular components are kept.

    The factors (and the original bias, if any) are registered as
    ``nn.Parameter`` objects, so they are counted by ``parameters()``, included
    in ``state_dict()`` / saved checkpoints, and follow ``.to()`` / ``.cuda()``.
    They are stored in the dtype of the original weight, so the layer stays
    dtype-compatible with the rest of the model.

    Assumes the ``nn.Linear`` weight layout ``(out_features, in_features)``.
    NOTE(review): layers that store the transposed weight (presumably e.g. the
    ``Conv1D`` projection used by GPT-2 -- confirm) must be transposed first.

    Args:
        rank: Number of singular components to keep (must be >= 1). Values
            larger than ``min(out_features, in_features)`` are clamped.
        full_rank_layer: Module exposing a 2-D ``weight`` and, optionally, a
            ``bias``.

    Attributes:
        rank: Effective (clamped) rank.
        U: ``(out_features, rank)`` left singular vectors.
        S: ``(rank,)`` singular values, stored as a vector, not a diagonal matrix.
        Vh: ``(rank, in_features)`` right singular vectors.
        bias: Copy of the original bias, or ``None``.

    Raises:
        ValueError: If ``rank`` is smaller than 1.
    """
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        if rank < 1:
            raise ValueError(f"rank must be >= 1, got {rank}")

        source_weight = full_rank_layer.weight
        weight = source_weight.detach()  # keep the factorisation out of autograd
        dtype = weight.dtype
        trainable = source_weight.requires_grad

        # A matrix has at most min(out, in) singular values; clamping keeps the
        # three factors consistent in shape.
        self.rank = min(rank, *weight.shape)

        # SVD in float64 for accuracy. The reduced form is sufficient because
        # only the leading components are used.
        U, S, Vh = torch.linalg.svd(weight.double(), full_matrices=False)

        def as_parameter(factor):
            # Fresh contiguous storage in the original dtype (slices are views).
            owned = factor.to(dtype).clone(memory_format=torch.contiguous_format)
            return nn.Parameter(owned, requires_grad=trainable)

        self.U = as_parameter(U[:, :self.rank])
        self.S = as_parameter(S[:self.rank])
        self.Vh = as_parameter(Vh[:self.rank, :])

        # Preserve the bias of the original layer instead of dropping it.
        bias = getattr(full_rank_layer, "bias", None)
        if bias is None:
            self.register_parameter("bias", None)
        else:
            self.bias = nn.Parameter(bias.detach().clone(), requires_grad=bias.requires_grad)

    def forward(self, x):
        # Equivalent to F.linear(x, U @ diag(S) @ Vh, bias), but applied factor
        # by factor so the full weight matrix is never materialised:
        # project to `rank` dims, scale by the singular values, project out.
        hidden = F.linear(x, self.Vh) * self.S
        return F.linear(hidden, self.U, self.bias)


In [None]:
def get_submodules(model, key):
    """Resolve a dotted module path inside ``model``.

    Returns a ``(parent, target, target_name)`` triple: the module that owns
    the target, the target module itself, and the last component of ``key``.
    """
    parent_path, _, target_name = key.rpartition(".")
    return model.get_submodule(parent_path), model.get_submodule(key), target_name

def recursive_setattr(obj, attr, value):
    """Assign ``value`` at a dotted attribute path below ``obj``.

    Used to swap a target layer for its low-rank replacement: every path
    component except the last is resolved with ``getattr``, and the last one
    is assigned with ``setattr``.
    """
    *owners, leaf = attr.split('.')
    for name in owners:
        obj = getattr(obj, name)
    setattr(obj, leaf, value)


In [None]:
# Work on an independent deep copy: layers are swapped inside `model_lr`, while
# `model` stays as loaded so the two can be compared afterwards.
import copy
model_lr = copy.deepcopy(model)


In [None]:
# Swap every targeted layer inside `model_lr` for its low-rank replacement.
# Originals are read from `model`, so `model_lr` can be modified while iterating.
target_suffixes = tuple("." + target_key for target_key in config.target_modules)
replaced = []
with torch.no_grad():  # the factorisation itself should not be tracked by autograd
    for key, module in model.named_modules():
        if key.endswith(target_suffixes):
            # replace target layer with low rank layer
            recursive_setattr(model_lr, key, LowRankLayer(config.rank, module))
            replaced.append(key)

# Report the outcome: when no name matches, nothing is replaced and the
# "compressed" model is identical to the original.
if replaced:
    print(f"replaced {len(replaced)} module(s) with rank-{config.rank} layers")
else:
    print(f"WARNING: no module name ends with any of {config.target_modules}; model_lr is unchanged")

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Only parameters reported by ``model.named_parameters()`` are counted, so
    tensors stored as plain attributes are not included.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. an ``nn.Module``).
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without registered parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
print_trainable_parameters(model)

In [None]:
print_trainable_parameters(model_lr)

In [None]:
# Relative reduction: 1 - (smaller count / larger count).
# NOTE(review): the two numbers are presumably the "all params" values printed
# for model_lr and model above, typed in by hand -- confirm, or compute the
# ratio from the models so it cannot go stale.
1-1116133376/1418270720

0.2130322086886205

In [None]:
# Write the original model to ./model. `from_pt` is not a save_pretrained
# option (it was silently ignored), so it is dropped.
model.save_pretrained("model")


In [None]:
# Write the compressed model to ./model_lr. `from_pt` is not a save_pretrained
# option (it was silently ignored), so it is dropped.
# NOTE(review): only registered parameters/buffers are written -- confirm the
# low-rank factors actually appear in the saved checkpoint.
model_lr.save_pretrained("model_lr")


In [None]:
!ls -lh model/pytorch_model.bin

ls: cannot access 'model/pytorch_model.bin': No such file or directory


In [None]:
!ls -lh model_lr/pytorch_model.bin

ls: cannot access 'model_lr/pytorch_model.bin': No such file or directory


In [None]:
# Relative reduction: 1 - (smaller size / larger size).
# NOTE(review): 2.1 and 2.7 are presumably checkpoint sizes read off an earlier
# `ls -lh` run (units unknown from here) -- confirm against the saved files.
1-2.1/2.7

0.2222222222222222

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.set_default_device('cuda')
# Load the tokenizer of the checkpoint that model_lr was built from. A tokenizer
# from a different checkpoint can produce token ids outside the model's
# embedding table, which is the likely cause of the
# "CUDA error: device-side assert triggered" recorded for this cell.
tokenizer = AutoTokenizer.from_pretrained(model_lr.config.name_or_path, trust_remote_code=True)
# Keep the attention mask in the encoding so generate() receives it.
inputs = tokenizer('''```python
def add(x,y):
   ''', return_tensors="pt")

# Pass pad_token_id explicitly; otherwise generate() falls back to eos_token_id
# and logs a warning.
outputs = model_lr.generate(**inputs, max_length=200, pad_token_id=tokenizer.eos_token_id)
text = tokenizer.batch_decode(outputs)[0]
print(text)

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Prompt the compressed model with a second code-completion example.
# The attention mask is kept in the encoding so generate() receives it.
inputs = tokenizer('''```python
def compare(a, b):
    Compares two strings
   ''', return_tensors="pt")

# Pass pad_token_id explicitly; otherwise generate() falls back to eos_token_id
# and logs a warning.
outputs = model_lr.generate(**inputs, max_length=200, pad_token_id=tokenizer.eos_token_id)
text = tokenizer.batch_decode(outputs)[0]
print(text)

In [None]:
# Prompt the compressed model with a third code-completion example.
# The attention mask is kept in the encoding so generate() receives it.
inputs = tokenizer('''```python
def sum(array):
    For loop to compute sum of numbers in an array
   ''', return_tensors="pt")

# Pass pad_token_id explicitly; otherwise generate() falls back to eos_token_id
# and logs a warning.
outputs = model_lr.generate(**inputs, max_length=200, pad_token_id=tokenizer.eos_token_id)
text = tokenizer.batch_decode(outputs)[0]
print(text)