# Compression happening

## distilbert = 60%

## T5 = 60%

## T5 base 83.00%

## funnel_transformer smallbase 68%



In [18]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model
from torch.nn import functional as F

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank])
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank])
        self.Vh = nn.Parameter(Vh[:self.rank, :])

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
rank = 32  # Adjust this for more or less aggressive compression
model = replace_with_low_rank(model, rank)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Test the compressed model with a random input
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors='pt')
output = model(**inputs)

# Print the output shape and the actual output
print("Output shape:", output.last_hidden_state.shape)
print("Output:", output.last_hidden_state)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Original model size (parameters): 124439808
Compressed model size (parameters): 124439808
Compression rate: 0.00%
Output shape: torch.Size([1, 6, 768])
Output: tensor([[[-8.5444e-06, -1.4021e-01, -2.0845e-01,  ..., -1.5329e-01,
          -6.7826e-02, -1.9630e-01],
         [ 4.1949e-01,  2.3525e-01,  3.4816e-01,  ...,  4.5321e-02,
           1.5447e-01,  1.9547e-02],
         [ 2.5089e-01, -3.9139e-01, -2.6851e-01,  ..., -3.5611e-01,
          -1.5503e-01, -1.0117e-01],
         [-3.7585e-02,  4.5083e-01, -6.3438e-02,  ..., -5.4199e-01,
           2.9846e-01,  6.0528e-02],
         [ 1.1579e-01, -3.7055e-01, -7.1209e-01,  ..., -8.2506e-02,
           5.2692e-02,  1.6689e-01],
         [ 2.8985e-01, -2.3842e-01,  5.7513e-02,  ..., -8.6427e-02,
          -4.7248e-02,  3.3461e-01]]], device='cuda:0',
       grad_fn=<ViewBackward0>)


In [22]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn import functional as F
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank])
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank])
        self.Vh = nn.Parameter(Vh[:self.rank, :])

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
rank = 32  # Adjust this for more or less aggressive compression
model = replace_with_low_rank(model, rank)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Test the compressed model with a random input
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors='pt')
output = model(**inputs)
print("Output shape:", output.last_hidden_state.shape)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Replaced layer: transformer.layer.0.attention.q_lin
Replaced layer: transformer.layer.0.attention.k_lin
Replaced layer: transformer.layer.0.attention.v_lin
Replaced layer: transformer.layer.0.attention.out_lin
Replaced layer: transformer.layer.0.ffn.lin1
Replaced layer: transformer.layer.0.ffn.lin2
Replaced layer: transformer.layer.1.attention.q_lin
Replaced layer: transformer.layer.1.attention.k_lin
Replaced layer: transformer.layer.1.attention.v_lin
Replaced layer: transformer.layer.1.attention.out_lin
Replaced layer: transformer.layer.1.ffn.lin1
Replaced layer: transformer.layer.1.ffn.lin2
Replaced layer: transformer.layer.2.attention.q_lin
Replaced

In [1]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn import functional as F
from huggingface_hub import HfApi, create_repo, upload_folder
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)
# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
rank = 32  # Adjust this for more or less aggressive compression
model = replace_with_low_rank(model, rank)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_distilbert"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

# Create a new repository on Hugging Face
repo_name = "pavan01729/compressed_distilbert"
create_repo(repo_name, exist_ok=True)

# Upload the model directory to the repository
upload_folder(repo_id=repo_name, folder_path=model_dir)

print(f"Model pushed to Hugging Face Hub at: https://huggingface.co/{repo_name}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Replaced layer: transformer.layer.0.attention.q_lin
Replaced layer: transformer.layer.0.attention.k_lin
Replaced layer: transformer.layer.0.attention.v_lin
Replaced layer: transformer.layer.0.attention.out_lin
Replaced layer: transformer.layer.0.ffn.lin1
Replaced layer: transformer.layer.0.ffn.lin2
Replaced layer: transformer.layer.1.attention.q_lin
Replaced layer: transformer.layer.1.attention.k_lin
Replaced layer: transformer.layer.1.attention.v_lin
Replaced layer: transformer.layer.1.attention.out_lin
Replaced layer: transformer.layer.1.ffn.lin1
Replaced layer: transformer.layer.1.ffn.lin2
Replaced layer: transformer.layer.2.attention.q_lin
Replaced layer: transformer.layer.2.attention.k_lin
Replaced layer: transformer.layer.2.attention.v_lin
Replaced layer: transformer.layer.2.attention.out_lin
Replaced layer: transformer.layer.2.ffn.lin1
Replaced layer: transformer.layer.2.ffn.lin2
Replaced layer: transformer.layer.3.attention.q_lin
Replaced layer: transformer.layer.3.attention.k_

model.safetensors:   0%|          | 0.00/106M [00:00<?, ?B/s]

Model pushed to Hugging Face Hub at: https://huggingface.co/pavan01729/compressed_distilbert


## distilbert_base 60%

In [7]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'distilbert-base-uncased'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


Replaced layer: transformer.layer.0.attention.q_lin
Replaced layer: transformer.layer.0.attention.k_lin
Replaced layer: transformer.layer.0.attention.v_lin
Replaced layer: transformer.layer.0.attention.out_lin
Replaced layer: transformer.layer.0.ffn.lin1
Replaced layer: transformer.layer.0.ffn.lin2
Replaced layer: transformer.layer.1.attention.q_lin
Replaced layer: transformer.layer.1.attention.k_lin
Replaced layer: transformer.layer.1.attention.v_lin
Replaced layer: transformer.layer.1.attention.out_lin
Replaced layer: transformer.layer.1.ffn.lin1
Replaced layer: transformer.layer.1.ffn.lin2
Replaced layer: transformer.layer.2.attention.q_lin
Replaced layer: transformer.layer.2.attention.k_lin
Replaced layer: transformer.layer.2.attention.v_lin
Replaced layer: transformer.layer.2.attention.out_lin
Replaced layer: transformer.layer.2.ffn.lin1
Replaced layer: transformer.layer.2.ffn.lin2
Replaced layer: transformer.layer.3.attention.q_lin
Replaced layer: transformer.layer.3.attention.k_

## Bert base uncased 70%

In [5]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'bert-base-uncased'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Replaced layer: encoder.layer.0.attention.self.query
Replaced layer: encoder.layer.0.attention.self.key
Replaced layer: encoder.layer.0.attention.self.value
Replaced layer: encoder.layer.0.attention.output.dense
Replaced layer: encoder.layer.0.intermediate.dense
Replaced layer: encoder.layer.0.output.dense
Replaced layer: encoder.layer.1.attention.self.query
Replaced layer: encoder.layer.1.attention.self.key
Replaced layer: encoder.layer.1.attention.self.value
Replaced layer: encoder.layer.1.attention.output.dense
Replaced layer: encoder.layer.1.intermediate.dense
Replaced layer: encoder.layer.1.output.dense
Replaced layer: encoder.layer.2.attention.self.query
Replaced layer: encoder.layer.2.attention.self.key
Replaced layer: encoder.layer.2.attention.self.value
Replaced layer: encoder.layer.2.attention.output.dense
Replaced layer: encoder.layer.2.intermediate.dense
Replaced layer: encoder.layer.2.output.dense
Replaced layer: encoder.layer.3.attention.self.query
Replaced layer: encoder

## facebook bart 66%

In [6]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'facebook/bart-base'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Replaced layer: encoder.layers.0.self_attn.k_proj
Replaced layer: encoder.layers.0.self_attn.v_proj
Replaced layer: encoder.layers.0.self_attn.q_proj
Replaced layer: encoder.layers.0.self_attn.out_proj
Replaced layer: encoder.layers.0.fc1
Replaced layer: encoder.layers.0.fc2
Replaced layer: encoder.layers.1.self_attn.k_proj
Replaced layer: encoder.layers.1.self_attn.v_proj
Replaced layer: encoder.layers.1.self_attn.q_proj
Replaced layer: encoder.layers.1.self_attn.out_proj
Replaced layer: encoder.layers.1.fc1
Replaced layer: encoder.layers.1.fc2
Replaced layer: encoder.layers.2.self_attn.k_proj
Replaced layer: encoder.layers.2.self_attn.v_proj
Replaced layer: encoder.layers.2.self_attn.q_proj
Replaced layer: encoder.layers.2.self_attn.out_proj
Replaced layer: encoder.layers.2.fc1
Replaced layer: encoder.layers.2.fc2
Replaced layer: encoder.layers.3.self_attn.k_proj
Replaced layer: encoder.layers.3.self_attn.v_proj
Replaced layer: encoder.layers.3.self_attn.q_proj
Replaced layer: encode

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Replaced layer: decoder.layers.5.fc2
Original model size (parameters): 139420416
Compressed model size (parameters): 46916352
Compression rate: 66.35%
Model saved to directory: compressed_model


## T5 small 60% compress

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-small'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

## T5 base 83.00%

In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-base'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")




config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

## T5 3b 95%

In [3]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-3b'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/11.4G [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

## T5 large 90%

In [4]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-large'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

## funnel_transformer smallbase 68%

In [9]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'funnel-transformer/small-base'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


tokenizer_config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/462M [00:00<?, ?B/s]

Replaced layer: encoder.blocks.0.0.attention.q_head
Replaced layer: encoder.blocks.0.0.attention.k_head
Replaced layer: encoder.blocks.0.0.attention.v_head
Replaced layer: encoder.blocks.0.0.attention.post_proj
Replaced layer: encoder.blocks.0.0.ffn.linear_1
Replaced layer: encoder.blocks.0.0.ffn.linear_2
Replaced layer: encoder.blocks.0.1.attention.q_head
Replaced layer: encoder.blocks.0.1.attention.k_head
Replaced layer: encoder.blocks.0.1.attention.v_head
Replaced layer: encoder.blocks.0.1.attention.post_proj
Replaced layer: encoder.blocks.0.1.ffn.linear_1
Replaced layer: encoder.blocks.0.1.ffn.linear_2
Replaced layer: encoder.blocks.0.2.attention.q_head
Replaced layer: encoder.blocks.0.2.attention.k_head
Replaced layer: encoder.blocks.0.2.attention.v_head
Replaced layer: encoder.blocks.0.2.attention.post_proj
Replaced layer: encoder.blocks.0.2.ffn.linear_1
Replaced layer: encoder.blocks.0.2.ffn.linear_2
Replaced layer: encoder.blocks.0.3.attention.q_head
Replaced layer: encoder.blo

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Replaced layer: encoder.embedding_hidden_mapping_in
Replaced layer: encoder.albert_layer_groups.0.albert_layers.0.attention.query
Replaced layer: encoder.albert_layer_groups.0.albert_layers.0.attention.key
Replaced layer: encoder.albert_layer_groups.0.albert_layers.0.attention.value
Replaced layer: encoder.albert_layer_groups.0.albert_layers.0.attention.dense
Replaced layer: encoder.albert_layer_groups.0.albert_layers.0.ffn
Replaced layer: encoder.albert_layer_groups.0.albert_layers.0.ffn_output


ValueError: not enough values to unpack (expected 2, got 1)

## Evaluation

In [13]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-small'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# Load the dataset
dataset = load_dataset("glue", "mrpc")
metric = load_metric("glue", "mrpc")

# Preprocess the dataset
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    inputs = ["mrpc sentence1: " + ex for ex in examples["sentence1"]]
    targets = [ex for ex in examples["sentence2"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the original model for comparison
original_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)


2024-07-01 08:58:54.893556: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 08:58:54.893658: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 08:58:55.050221: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

  metric = load_metric("glue", "mrpc")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return metric.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)




In [17]:
# Trainer for the original model
original_trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the original model
original_trainer.train()

# Evaluate the original model
original_eval_results = original_trainer.evaluate()

# Trainer for the compressed model
compressed_model = T5ForConditionalGeneration.from_pretrained("compressed_model")

compressed_trainer = Trainer(
    model=compressed_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the compressed model
compressed_trainer.train()

# Evaluate the compressed model
compressed_eval_results = compressed_trainer.evaluate()

# Print the evaluation results
print("Original Model Evaluation Results:", original_eval_results)
print("Compressed Model Evaluation Results:", compressed_eval_results)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.88 GiB. GPU 0 has a total capacty of 14.75 GiB of which 5.23 GiB is free. Process 2525 has 9.51 GiB memory in use. Of the allocated memory 6.67 GiB is allocated by PyTorch, and 2.64 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, T5ForConditionalGeneration
from torch.nn import functional as F
from datasets import load_dataset, load_metric

# Specify the model name and compression rank
MODEL_NAME = 't5-small'  # Replace with your desired model name
# MODEL_NAME = 'distilbert-base-uncased'  # Replace with your desired model name

COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression
HUGGINGFACE_TOKEN = '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'  # Replace with your Hugging Face token

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model with the Hugging Face token
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)
model = AutoModel.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")

# Evaluation
dataset = load_dataset("glue", "mrpc")
metric = load_metric("glue", "mrpc")

def preprocess_function(examples):
    inputs = ["mrpc sentence1: " + ex for ex in examples["sentence1"]]
    targets = [ex for ex in examples["sentence2"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[]  # Disable WandB logging
)

# Trainer for the original model
original_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)
original_trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the original model
original_trainer.train()

# Evaluate the original model
original_eval_results = original_trainer.evaluate()

# Trainer for the compressed model
compressed_model = T5ForConditionalGeneration.from_pretrained("compressed_model", use_auth_token=HUGGINGFACE_TOKEN)

compressed_trainer = Trainer(
    model=compressed_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the compressed model
compressed_trainer.train()

# Evaluate the compressed model
compressed_eval_results = compressed_trainer.evaluate()

# Print the evaluation results
print("Original Model Evaluation Results:", original_eval_results)
print("Compressed Model Evaluation Results:", compressed_eval_results)


2024-07-01 12:48:20.919350: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 12:48:20.919469: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 12:48:21.085912: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]