# Compression happening

## distilbert = 60%

## T5 = 60%

## T5 base 83.00%

## funnel_transformer smallbase 68%



## T5 base 83.00%

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-base'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


  from .autonotebook import tqdm as notebook_tqdm


Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

In [3]:
!pip install sentencepiece


Collecting sentencepiece
  Using cached sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Using cached sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Better script with storing models only in temp, and more print statements

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import tempfile
import os

# Specify the model name and compression rank
MODEL_NAME = 'google/t5-v1_1-base'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the size of a model saved to a temporary file
def get_model_size_in_bytes(model):
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        torch.save(model.state_dict(), tmp_file.name)
        tmp_file.seek(0, os.SEEK_END)
        size = tmp_file.tell()
    os.remove(tmp_file.name)  # Delete the temporary file immediately after getting its size
    return size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial parameter count and file size
original_size_params = count_parameters(model)
original_size_bytes = get_model_size_in_bytes(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final parameter count and file size
compressed_size_params = count_parameters(model)
compressed_size_bytes = get_model_size_in_bytes(model)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_size_params}")
print(f"Compressed model size (parameters): {compressed_size_params}")
print(f"Parameter compression rate: {(original_size_params - compressed_size_params) / original_size_params:.2%}")

print(f"Original model size (bytes): {original_size_bytes / (1024 ** 2):.2f} MB")
print(f"Compressed model size (bytes): {compressed_size_bytes / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_size_bytes - compressed_size_bytes) / original_size_bytes:.2%}")

# Save the tokenizer and compressed model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Compressed model saved to directory: {model_dir}")


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi_0
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi_1
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi_0
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi_1
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.bloc

## Similar to SVDLLM

In [8]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import tempfile
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

# Specify the model name and compression rank
MODEL_NAME = 'google/t5-v1_1-base'
COMPRESSION_RANK = 32

# Dummy dataset for calibration (replace with your actual dataset)
class DummyDataset(Dataset):
    def __init__(self, size=100, seq_len=512, vocab_size=32128):
        self.size = size
        self.seq_len = seq_len
        self.vocab_size = vocab_size

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        input_ids = torch.randint(0, self.vocab_size, (self.seq_len,))
        attention_mask = torch.ones(self.seq_len)
        return {"input_ids": input_ids, "attention_mask": attention_mask}

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            if '.' in name:
                parent_name, child_name = name.rsplit('.', 1)
                parent_module = model.get_submodule(parent_name)
            else:
                parent_module = model
                child_name = name
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the size of a model saved to a temporary file
def get_model_size_in_bytes(model):
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        torch.save(model.state_dict(), tmp_file.name)
        tmp_file.seek(0, os.SEEK_END)
        size = tmp_file.tell()
    os.remove(tmp_file.name)  # Delete the temporary file immediately after getting its size
    return size

# Function to find layers in a module
def find_layers(module, layers=None):
    if layers is None:
        layers = {}
    for name, sub_module in module.named_children():
        if isinstance(sub_module, nn.Linear):
            layers[name] = sub_module
        else:
            find_layers(sub_module, layers)
    return layers

# Function to perform profiling and whitening
@torch.no_grad()
def profile_and_whiten(model, calib_loader, dev):
    layers = model.encoder.block + model.decoder.block
    model = model.to(dev)
    print("Start obtaining the whitening matrix...")

    def hook(module, input, output):
        inp = input[0].detach().float()
        if inp.dim() == 2:   # For some layers
            inp = inp.unsqueeze(0)
        adds = torch.matmul(inp.transpose(1, 2), inp)
        adds_sum = torch.sum(adds, dim=0)
        if hasattr(module, 'raw_scaling_diag_matrix'):
            module.raw_scaling_diag_matrix += adds_sum
        del inp, adds, adds_sum
        torch.cuda.empty_cache()

    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            module.raw_scaling_diag_matrix = 0
            module.register_forward_hook(hook)

    for batch in tqdm(calib_loader):
        batch = {k: v.to(dev) for k, v in batch.items()}
        model(**batch)

    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            module._forward_hooks.clear()
    torch.cuda.empty_cache()

    model = model.cpu()
    profiling_mat = {}
    print("Start Cholesky Decomposition...")

    for layer in layers:
        subset = find_layers(layer)
        for name in subset:
            if not hasattr(subset[name], 'raw_scaling_diag_matrix'):
                continue
            raw_scaling_diag_matrix = subset[name].raw_scaling_diag_matrix.double().to(dev)
            try:
                scaling_diag_matrix = torch.linalg.cholesky(raw_scaling_diag_matrix)
            except Exception as e:
                print("Warning: eigen scaling_diag_matrix is not positive!")
                eigenvalues = torch.linalg.eigvalsh(raw_scaling_diag_matrix)
                raw_scaling_diag_matrix += (-eigenvalues[0] + 1e-6) * torch.eye(raw_scaling_diag_matrix.shape[0]).to(dev)
                scaling_diag_matrix = torch.linalg.cholesky(raw_scaling_diag_matrix)
            profiling_mat[name] = scaling_diag_matrix.cpu()
            del scaling_diag_matrix, raw_scaling_diag_matrix, subset[name].raw_scaling_diag_matrix
            torch.cuda.empty_cache()
    return profiling_mat

# Function to perform whitening and low-rank decomposition
@torch.no_grad()
def whitening(model_name, model, profiling_mat, ratio, dev):
    model.eval()
    layers = model.encoder.block + model.decoder.block
    print("Start SVD decomposition after whitening...")

    for layer in layers:
        subset = find_layers(layer)
        for name in subset:
            if not hasattr(subset[name], 'weight'):
                continue
            W = subset[name].weight.data.float().to(dev)
            dtype = W.dtype
            scaling_diag_matrix = profiling_mat.get(name, None)
            if scaling_diag_matrix is None:
                continue
            scaling_diag_matrix = scaling_diag_matrix.to(dev)
            try:
                scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)
            except Exception as e:
                print("Warning: scaling_diag_matrix is not full rank!")
                scaling_diag_matrix += 1e-6 * torch.eye(scaling_diag_matrix.shape[0]).to(dev)
                scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)
            W_scale = torch.matmul(W, scaling_diag_matrix)
            U, S, VT = torch.linalg.svd(W_scale, full_matrices=False)
            num_s_after_trunc = int(W.shape[0] * W.shape[1] * ratio / (W.shape[0] + W.shape[1]))
            truc_s = S[:num_s_after_trunc]
            truc_u = U[:, :num_s_after_trunc]
            truc_v = torch.matmul(VT[:num_s_after_trunc, :], scaling_matrix_inv)
            truc_sigma = torch.diag(truc_s)
            sqrtSigma = torch.sqrt(truc_sigma)
            svd_u = torch.matmul(truc_u, sqrtSigma).cpu().to(dtype)
            svd_v = torch.matmul(sqrtSigma, truc_v).cpu().to(dtype)
            low_rank_layer = LowRankLayer(svd_u.size(1), subset[name])
            low_rank_layer.U = nn.Parameter(svd_u)
            low_rank_layer.S = nn.Parameter(truc_sigma)
            low_rank_layer.Vh = nn.Parameter(svd_v)
            if '.' in name:
                parent_name, child_name = name.rsplit('.', 1)
                parent_module = model.get_submodule(parent_name)
            else:
                parent_module = model
                child_name = name
            setattr(parent_module, child_name, low_rank_layer)
            del W, W_scale, scaling_matrix_inv, scaling_diag_matrix, U, S, VT, truc_s, truc_u, truc_v, sqrtSigma
            torch.cuda.empty_cache()

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Get initial parameter count and file size
original_size_params = count_parameters(model)
original_size_bytes = get_model_size_in_bytes(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Profiling and Whitening
calib_dataset = DummyDataset()
calib_loader = DataLoader(calib_dataset, batch_size=4, shuffle=True)
profiling_mat = profile_and_whiten(model, calib_loader, 'cuda')
whitening(MODEL_NAME, model, profiling_mat, COMPRESSION_RANK / original_size_params, 'cuda')

# Get final parameter count and file size
compressed_size_params = count_parameters(model)
compressed_size_bytes = get_model_size_in_bytes(model)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_size_params}")
print(f"Compressed model size (parameters): {compressed_size_params}")
print(f"Parameter compression rate: {(original_size_params - compressed_size_params) / original_size_params:.2%}")

print(f"Original model size (bytes): {original_size_bytes / (1024 ** 2):.2f} MB")
print(f"Compressed model size (bytes): {compressed_size_bytes / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_size_bytes - compressed_size_bytes) / original_size_bytes:.2%}")

# Save the tokenizer and compressed model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Compressed model saved to directory: {model_dir}")


Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi_0
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi_1
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi_0
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi_1
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.bloc

  0%|          | 0/25 [00:00<?, ?it/s]


AttributeError: 'LowRankLayer' object has no attribute 'weight'

## LLama3 Trial

In [1]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B"

pipeline = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
pipeline("Hey how are you doing today?")


  from .autonotebook import tqdm as notebook_tqdm


Downloading shards: 100%|██████████| 4/4 [00:55<00:00, 13.75s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.05it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Hey how are you doing today? I’m really excited to be here, I’m going to be talking about the top 5 mistakes that people make when it comes to their finances and how to avoid them. So, let’s get started. Number one, not having a budget. This is one of the biggest mistakes that people make, because without a budget you don’t know where your money is going and you’re more likely to overspend. So, the first thing you need to do is sit down and create a budget for yourself. This will help you track your spending and make sure that you’re staying on track. Number two, not saving enough for retirement. This is another big mistake that people make, because they don’t realize how much they need to save for retirement until it’s too late. So, make sure that you’re saving at least 10% of your income every month for retirement. Number three, not having an emergency fund. This is something that everyone should have, because you never know when an emergency is going to happen a

In [1]:
import transformers
import torch

model_id = "pavan01729/Compressed_LLama3_8b"

pipeline = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
pipeline("Hey how are you doing today?")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at pavan01729/Compressed_LLama3_8b and are newly initialized: ['layers.13.self_attn.k_proj.weight', 'layers.21.self_attn.o_proj.weight', 'layers.9.mlp.up_proj.weight', 'layers.21.self_attn.q_proj.weight', 'layers.23.mlp.up_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.4.self_attn.q_proj.weight', 'layers.8.self_attn.o_proj.weight', 'layers.11.mlp.gate_proj.weight', 'layers.11.self_attn.q_proj.weight', 'layers.26.mlp.up_proj.weight', 'layers.27.self_attn.v_proj.weight', 'layers.25.mlp.down_proj.weight', 'layers.31.mlp.up_proj.weight', 'layers.5.self_attn.k_proj.weight', 'layers.23.mlp.gate_proj.weight', 'layers.17.mlp.up_proj.weight', 'layers.15.mlp.up_proj.weight', 'layers.16.mlp.gate_proj.weight', 'layers.19.mlp.down_proj.weight', 'layers.18.self_attn.o_proj.weight', 'layers.17.self_attn.q_proj.weight', 'layers.6.mlp.up_proj.weight', 'layers.15.m

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [13]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.

In [14]:
from huggingface_hub import HfApi, login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Log in to Hugging Face
login(token="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")

# Define the model repository name
repo_name = "pavan01729/Compressed_LLama3_8b"  # Replace with your desired repo name

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, check_pr=True)
tokenizer.push_to_hub(repo_name, check_pr=True)

print(f"Model pushed to Hugging Face at: https://huggingface.co/{repo_name}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Model pushed to Hugging Face at: https://huggingface.co/pavan01729/Compressed_LLama3_8b


## LLama2-7b Original

In [1]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'  # Replace with your desired model name
COMPRESSION_RANK = 128  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")




Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   1%|          | 73.4M/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.

In [2]:
from huggingface_hub import HfApi, login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Log in to Hugging Face
login(token="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")

# Define the model repository name
repo_name = "pavan01729/Compressed_LLama2_7b"  # Replace with your desired repo name

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, check_pr=True)
tokenizer.push_to_hub(repo_name, check_pr=True)

print(f"Model pushed to Hugging Face at: https://huggingface.co/{repo_name}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


pytorch_model.bin:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Model pushed to Hugging Face at: https://huggingface.co/pavan01729/Compressed_LLama2_7b


In [4]:
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [9]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import tempfile
import os

# Specify the model name and compression rank
MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.2'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the size of a model saved to a temporary file
def get_model_size_in_bytes(model):
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        torch.save(model.state_dict(), tmp_file.name)
        tmp_file.seek(0, os.SEEK_END)
        size = tmp_file.tell()
    os.remove(tmp_file.name)  # Delete the temporary file immediately after getting its size
    return size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial parameter count and file size
original_size_params = count_parameters(model)
original_size_bytes = get_model_size_in_bytes(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final parameter count and file size
compressed_size_params = count_parameters(model)
compressed_size_bytes = get_model_size_in_bytes(model)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_size_params}")
print(f"Compressed model size (parameters): {compressed_size_params}")
print(f"Parameter compression rate: {(original_size_params - compressed_size_params) / original_size_params:.2%}")

print(f"Original model size (bytes): {original_size_bytes / (1024 ** 2):.2f} MB")
print(f"Compressed model size (bytes): {compressed_size_bytes / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_size_bytes - compressed_size_bytes) / original_size_bytes:.2%}")

# Save the tokenizer and compressed model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Compressed model saved to directory: {model_dir}")


Collecting transformers
  Using cached transformers-4.42.3-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.42.3-py3-none-any.whl (9.3 MB)
Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.31.0
    Uninstalling transformers-4.31.0:
      Successfully uninstalled transformers-4.31.0
Successfully installed tokenizers-0.19.1 transformers-4.42.3
Error loading model or tokenizer: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 40