# Compression happening

## distilbert = 60%

## T5 = 60%

## T5 base 83.00%

## funnel_transformer smallbase 68%



In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model
from torch.nn import functional as F

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank])
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank])
        self.Vh = nn.Parameter(Vh[:self.rank, :])

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
rank = 32  # Adjust this for more or less aggressive compression
model = replace_with_low_rank(model, rank)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Test the compressed model with a random input
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors='pt')
output = model(**inputs)

# Print the output shape and the actual output
print("Output shape:", output.last_hidden_state.shape)
print("Output:", output.last_hidden_state)


: 

In [21]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn import functional as F
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank])
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank])
        self.Vh = nn.Parameter(Vh[:self.rank, :])

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
rank = 32  # Adjust this for more or less aggressive compression
model = replace_with_low_rank(model, rank)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Test the compressed model with a random input
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors='pt')
output = model(**inputs)
print("Output shape:", output.last_hidden_state.shape)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.nn import functional as F
from huggingface_hub import HfApi, create_repo, upload_folder
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)
# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
rank = 32  # Adjust this for more or less aggressive compression
model = replace_with_low_rank(model, rank)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_distilbert"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

# Create a new repository on Hugging Face
repo_name = "pavan01729/compressed_distilbert"
create_repo(repo_name, exist_ok=True)

# Upload the model directory to the repository
upload_folder(repo_id=repo_name, folder_path=model_dir)

print(f"Model pushed to Hugging Face Hub at: https://huggingface.co/{repo_name}")


## distilbert_base 60%

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'distilbert-base-uncased'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Replaced layer: transformer.layer.0.attention.q_lin
Replaced layer: transformer.layer.0.attention.k_lin
Replaced layer: transformer.layer.0.attention.v_lin
Replaced layer: transformer.layer.0.attention.out_lin
Replaced layer: transformer.layer.0.ffn.lin1
Replaced layer: transformer.layer.0.ffn.lin2
Replaced layer: transformer.layer.1.attention.q_lin
Replaced layer: transformer.layer.1.attention.k_lin
Replaced layer: transformer.layer.1.attention.v_lin
Replaced layer: transformer.layer.1.attention.out_lin
Replaced layer: transformer.layer.1.ffn.lin1
Replaced layer: transformer.layer.1.ffn.lin2
Replaced layer: transformer.layer.2.attention.q_lin
Replaced layer: transformer.layer.2.attention.k_lin
Replaced layer: transformer.layer.2.attention.v_lin
Replaced layer: transformer.layer.2.attention.out_lin
Replaced layer: transformer.layer.2.ffn.lin1
Replaced layer: transformer.layer.2.ffn.lin2
Replaced layer: transformer.layer.3.attention.q_lin
Replaced layer: transformer.layer.3.attention.k_

## Bert base uncased 70%

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'bert-base-uncased'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


## facebook bart 66%

In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'facebook/bart-base'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Replaced layer: encoder.layers.0.self_attn.k_proj
Replaced layer: encoder.layers.0.self_attn.v_proj
Replaced layer: encoder.layers.0.self_attn.q_proj
Replaced layer: encoder.layers.0.self_attn.out_proj
Replaced layer: encoder.layers.0.fc1
Replaced layer: encoder.layers.0.fc2
Replaced layer: encoder.layers.1.self_attn.k_proj
Replaced layer: encoder.layers.1.self_attn.v_proj
Replaced layer: encoder.layers.1.self_attn.q_proj
Replaced layer: encoder.layers.1.self_attn.out_proj
Replaced layer: encoder.layers.1.fc1
Replaced layer: encoder.layers.1.fc2
Replaced layer: encoder.layers.2.self_attn.k_proj
Replaced layer: encoder.layers.2.self_attn.v_proj
Replaced layer: encoder.layers.2.self_attn.q_proj
Replaced layer: encoder.layers.2.self_attn.out_proj
Replaced layer: encoder.layers.2.fc1
Replaced layer: encoder.layers.2.fc2
Replaced layer: encoder.layers.3.self_attn.k_proj
Replaced layer: encoder.layers.3.self_attn.v_proj
Replaced layer: encoder.layers.3.self_attn.q_proj
Replaced layer: encode

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Model saved to directory: compressed_model


## T5 small 60% compress

In [3]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-small'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

## T5 base 83.00%

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-base'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")




config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

## T5 3b 95%

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-3b'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


## T5 large 90%

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-large'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")




config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

## funnel_transformer smallbase 68%

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'funnel-transformer/small-base'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


In [None]:
model_names = ['funnel-transformer/small-base', 'bert-base-uncased', 'distilbert-base-uncased']

for model_name in model_names:
    print(f"Testing compression on model: {model_name}")
    # Adjust MODEL_NAME to the current model
    MODEL_NAME = model_name
    # Repeat the entire process with the current MODEL_NAME
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)

    original_size = count_parameters(model)
    model = replace_with_low_rank(model, COMPRESSION_RANK)
    compressed_size = count_parameters(model)

    print(f"Original model size (parameters): {original_size}")
    print(f"Compressed model size (parameters): {compressed_size}")
    print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

    model_dir = f"compressed_model_{model_name.replace('/', '_')}"
    tokenizer.save_pretrained(model_dir)
    model.save_pretrained(model_dir)

    print(f"Model saved to directory: {model_dir}\n")


## funnel transformer chat

In [23]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Load the tokenizer and model
MODEL_NAME = 'funnel-transformer/small-base'  # Replace with your desired model name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

# Set up the question-answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

def chat():
    print("Chatbot: Hi! I am a Q&A bot. Please provide some context to get started.")
    context = input("You (provide context): ").strip()
    
    print("Chatbot: Context received. Now you can ask questions based on this context.")
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("Chatbot: Goodbye!")
            break
        
        qa_input = {
            'question': user_input,
            'context': context
        }
        response = qa_pipeline(qa_input)
        answer = response['answer']
        print(f"Chatbot: {answer}")

# Start the chat
chat()


Some weights of FunnelForQuestionAnswering were not initialized from the model checkpoint at funnel-transformer/small-base and are newly initialized: ['decoder.layers.0.attention.k_head.bias', 'decoder.layers.0.attention.k_head.weight', 'decoder.layers.0.attention.layer_norm.bias', 'decoder.layers.0.attention.layer_norm.weight', 'decoder.layers.0.attention.post_proj.bias', 'decoder.layers.0.attention.post_proj.weight', 'decoder.layers.0.attention.q_head.weight', 'decoder.layers.0.attention.r_kernel', 'decoder.layers.0.attention.r_r_bias', 'decoder.layers.0.attention.r_s_bias', 'decoder.layers.0.attention.r_w_bias', 'decoder.layers.0.attention.seg_embed', 'decoder.layers.0.attention.v_head.bias', 'decoder.layers.0.attention.v_head.weight', 'decoder.layers.0.ffn.layer_norm.bias', 'decoder.layers.0.ffn.layer_norm.weight', 'decoder.layers.0.ffn.linear_1.bias', 'decoder.layers.0.ffn.linear_1.weight', 'decoder.layers.0.ffn.linear_2.bias', 'decoder.layers.0.ffn.linear_2.weight', 'decoder.laye

Chatbot: Hi! I am a Q&A bot. Please provide some context to get started.


KeyboardInterrupt: Interrupted by user

## GPT2 chat

In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
MODEL_NAME = 'gpt2'  # Replace with your desired model name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Function to generate a response
def generate_response(prompt, model, tokenizer, max_length=100, num_return_sequences=1):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs, 
        max_length=max_length, 
        num_return_sequences=num_return_sequences, 
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def chat():
    print("Chatbot: Hi! I am a GPT-2 based chat bot. Let's chat!")
    conversation_history = ""
    
    while True:
        user_input = input("You: ")
        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("Chatbot: Goodbye!")
            break
        
        # Append user input to the conversation history
        conversation_history += f"You: {user_input}\nChatbot: "
        
        # Generate response
        response = generate_response(conversation_history, model, tokenizer)
        
        # Extract the bot's response from the generated text
        response_text = response[len(conversation_history):].split("\n")[0]
        
        # Append bot response to conversation history
        conversation_history += f"{response_text}\n"
        
        print(f"Chatbot: {response_text}")

# Start the chat
chat()


Chatbot: Hi! I am a GPT-2 based chat bot. Let's chat!


You:  hi




Chatbot: !!


You:  code something


Chatbot: !


You:  chat 


Chatbot: 


You:  do somethign


Chatbot: !


You:  let it go


Chatbot: Your: you're not going to be able to do anything


You:  okay


Chatbot: Your: I'm not sure what you mean


You:  hi


Chatbot: Your: i'm sorry


You:  for what


ValueError: Input length of input_ids is 102, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'cerebras/Cerebras-GPT-111M'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


## GPT2 trial

In [28]:
from transformers import GPT2Model

model = GPT2Model.from_pretrained('gpt2')
print(model)


GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)


In [33]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'gpt2'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression

# Define LowRankConv1DLayer class for low-rank decomposition
class LowRankConv1DLayer(nn.Module):
    """Given a Conv1D layer, find low-rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Extract weight and bias
        weight = full_rank_layer.weight
        bias = full_rank_layer.bias

        # Perform SVD on the weight matrix
        weight_reshaped = weight.view(weight.size(0), -1)
        U, S, Vh = torch.linalg.svd(weight_reshaped.float(), full_matrices=False)
        S_diag = torch.diag(S[:rank])
        self.U = nn.Parameter(U[:, :rank].contiguous())
        self.S = nn.Parameter(S_diag.contiguous())
        self.Vh = nn.Parameter(Vh[:rank, :].contiguous())

        # Handle the bias term if it exists
        if bias is not None:
            self.bias = nn.Parameter(bias.float().contiguous())
        else:
            self.bias = None

        # Ensure the low-rank layer has fewer parameters than the original layer
        original_params = weight.numel() + (bias.numel() if bias is not None else 0)
        approx_params = self.U.numel() + self.S.numel() + self.Vh.numel() + (self.bias.numel() if self.bias is not None else 0)
        assert approx_params < original_params, "Low-rank approximation does not reduce parameters"

    def forward(self, x):
        weight_low_rank = (self.U @ self.S @ self.Vh).view(self.U.size(0), -1, 1)
        output = F.conv1d(x, weight_low_rank, self.bias)
        return output

# Function to replace Conv1D layers with LowRankConv1DLayer
def replace_conv1d_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Conv1d):
            try:
                # Create a LowRankConv1DLayer to replace the full-rank Conv1D layer
                low_rank_layer = LowRankConv1DLayer(rank, module)
                if '.' in name:
                    parent_name, child_name = name.rsplit('.', 1)
                    parent_module = model.get_submodule(parent_name)
                    setattr(parent_module, child_name, low_rank_layer)
                else:
                    setattr(model, name, low_rank_layer)
                print(f"Replaced layer: {name}")
            except AssertionError as e:
                print(f"Skipping replacement for layer {name}: {e}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace Conv1D layers with low-rank approximations
model = replace_conv1d_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


Original model size (parameters): 124439808
Compressed model size (parameters): 124439808
Compression rate: 0.00%
Model saved to directory: compressed_model


## Mistral 7b broken

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.3'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Evaluation

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-small'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# Load the dataset
dataset = load_dataset("glue", "mrpc")
metric = load_metric("glue", "mrpc")

# Preprocess the dataset
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    inputs = ["mrpc sentence1: " + ex for ex in examples["sentence1"]]
    targets = [ex for ex in examples["sentence2"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Load the original model for comparison
original_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)


In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return metric.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)


In [None]:
# Trainer for the original model
original_trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the original model
original_trainer.train()

# Evaluate the original model
original_eval_results = original_trainer.evaluate()

# Trainer for the compressed model
compressed_model = T5ForConditionalGeneration.from_pretrained("compressed_model")

compressed_trainer = Trainer(
    model=compressed_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the compressed model
compressed_trainer.train()

# Evaluate the compressed model
compressed_eval_results = compressed_trainer.evaluate()

# Print the evaluation results
print("Original Model Evaluation Results:", original_eval_results)
print("Compressed Model Evaluation Results:", compressed_eval_results)


In [None]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, T5ForConditionalGeneration
from torch.nn import functional as F
from datasets import load_dataset, load_metric
import numpy as np
import gc

# Enable CUDA launch blocking for detailed error messages
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Specify the model name and compression rank
MODEL_NAME = 't5-small'  # Replace with your desired model name

COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression
HUGGINGFACE_TOKEN = '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'  # Replace with your Hugging Face token

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model with the Hugging Face token
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)
model = AutoModel.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")

# Evaluation
dataset = load_dataset("glue", "mrpc")
metric = load_metric("glue", "mrpc")

def preprocess_function(examples):
    inputs = ["mrpc sentence1: " + ex for ex in examples["sentence1"]]
    targets = [ex for ex in examples["sentence2"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Further reduced batch size
    per_device_eval_batch_size=4,   # Further reduced batch size
    num_train_epochs=1,             # Only train for 1 epoch for quick evaluation
    weight_decay=0.01,
    report_to=[]  # Disable WandB logging
)

# Trainer for the original model
original_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)
original_trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(100)),  # Limit training dataset size
    eval_dataset=tokenized_datasets["validation"].select(range(50)),  # Limit evaluation dataset size
    compute_metrics=compute_metrics,
)

# Fine-tune the original model
original_trainer.train()

# Evaluate the original model
original_eval_results = original_trainer.evaluate()

# Trainer for the compressed model
compressed_model = T5ForConditionalGeneration.from_pretrained("compressed_model", use_auth_token=HUGGINGFACE_TOKEN)

compressed_trainer = Trainer(
    model=compressed_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(100)),  # Limit training dataset size
    eval_dataset=tokenized_datasets["validation"].select(range(50)),  # Limit evaluation dataset size
    compute_metrics=compute_metrics,
)

# Fine-tune the compressed model
compressed_trainer.train()

# Evaluate the compressed model
compressed_eval_results = compressed_trainer.evaluate()

# Print the evaluation results
print("Original Model Evaluation Results:", original_eval_results)
print("Compressed Model Evaluation Results:", compressed_eval_results)

# Clear CUDA cache and garbage collection
torch.cuda.empty_cache()
gc.collect()


In [None]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, T5ForConditionalGeneration
from torch.nn import functional as F
from datasets import load_dataset, load_metric
import numpy as np
import gc

# Enable CUDA launch blocking for detailed error messages
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Specify the model name and compression rank
MODEL_NAME = 't5-small'  # Replace with your desired model name

COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression
HUGGINGFACE_TOKEN = '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'  # Replace with your Hugging Face token

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model with the Hugging Face token
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)
model = AutoModel.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")

# Evaluation
dataset = load_dataset("glue", "mrpc")
metric = load_metric("glue", "mrpc")

def preprocess_function(examples):
    inputs = ["mrpc sentence1: " + ex for ex in examples["sentence1"]]
    targets = [ex for ex in examples["sentence2"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Further reduced batch size
    per_device_eval_batch_size=4,   # Further reduced batch size
    num_train_epochs=1,             # Only train for 1 epoch for quick evaluation
    weight_decay=0.01,
    report_to=[]  # Disable WandB logging
)

# Trainer for the original model
original_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, use_auth_token=HUGGINGFACE_TOKEN)
original_trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(100)),  # Limit training dataset size
    eval_dataset=tokenized_datasets["validation"].select(range(50)),  # Limit evaluation dataset size
    compute_metrics=compute_metrics,
)

# Fine-tune the original model
original_trainer.train()

# Evaluate the original model
original_eval_results = original_trainer.evaluate()

# Trainer for the compressed model
compressed_model = T5ForConditionalGeneration.from_pretrained("compressed_model", use_auth_token=HUGGINGFACE_TOKEN)

compressed_trainer = Trainer(
    model=compressed_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(100)),  # Limit training dataset size
    eval_dataset=tokenized_datasets["validation"].select(range(50)),  # Limit evaluation dataset size
    compute_metrics=compute_metrics,
)

# Fine-tune the compressed model
compressed_trainer.train()

# Evaluate the compressed model
compressed_eval_results = compressed_trainer.evaluate()

# Print the evaluation results
print("Original Model Evaluation Results:", original_eval_results)
print("Compressed Model Evaluation Results:", compressed_eval_results)

# Clear CUDA cache and garbage collection
torch.cuda.empty_cache()
gc.collect()


In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google-t5/t5-large'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Compression rate: {(original_size - compressed_size) / original_size:.2%}")

# Save the tokenizer and model to the directory
model_dir = "compressed_model"
tokenizer.save_pretrained(model_dir)
model.save_pretrained(model_dir)

print(f"Model saved to directory: {model_dir}")




config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Replaced layer: encoder.block.0.layer.0.SelfAttention.q
Replaced layer: encoder.block.0.layer.0.SelfAttention.k
Replaced layer: encoder.block.0.layer.0.SelfAttention.v
Replaced layer: encoder.block.0.layer.0.SelfAttention.o
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.0.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.1.layer.0.SelfAttention.q
Replaced layer: encoder.block.1.layer.0.SelfAttention.k
Replaced layer: encoder.block.1.layer.0.SelfAttention.v
Replaced layer: encoder.block.1.layer.0.SelfAttention.o
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.1.layer.1.DenseReluDense.wo
Replaced layer: encoder.block.2.layer.0.SelfAttention.q
Replaced layer: encoder.block.2.layer.0.SelfAttention.k
Replaced layer: encoder.block.2.layer.0.SelfAttention.v
Replaced layer: encoder.block.2.layer.0.SelfAttention.o
Replaced layer: encoder.block.2.layer.1.DenseReluDense.wi
Replaced layer: encoder.block.2.layer.

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Specify the model name
MODEL_NAME = 't5-base'  # You can change this to 't5-large' or other variants

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Function to generate a response
def generate_response(prompt, max_length=50):
    # Encode the input text
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    
    # Decode the response
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

# Chat loop
def chat():
    print("Start chatting with T5 (type 'exit' to stop)...")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            break
        response = generate_response(user_input)
        print(f"T5: {response}")

if __name__ == "__main__":
    chat()


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Start chatting with T5 (type 'exit' to stop)...


You:  hi


T5: hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi hi


You:  what


T5: ... What is what? What is what?


You:  okay


T5: okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay okay


You:  where is paris


T5: is paris where is paris where is paris where is paris where is paris where is paris where is paris where is paris where is paris where is paris where is paris where is paris where


You:  got it


T5: get it? got it? Got it? Got it? Got it? Got it? Got it? Got it?


You:  why does the sun rise


T5: why does the sun rise? Why does the sun rise?


KeyboardInterrupt: Interrupted by user

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# google/gemma-2b

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model from Hugging Face
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure the model is in evaluation mode and move it to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Define the prompt
prompt = "What is a good place for travel in the US?"

# Encode the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(inputs.input_ids, max_length=50)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)




ValueError: Tokenizer class GemmaTokenizer does not exist or is not currently imported.

In [3]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

prompt = (
    "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
    "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
    "researchers was the fact that the unicorns spoke perfect English."
)

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]



config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [6]:
prompt = (
    "Hi there, what can u do"
)

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]
print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hi there, what can u do? You have to buy from the shop.

I'd like to do some repairs as well. I went on a trip for a few days to
see my sister. I'll be staying here and looking after the place. Will
you let me get the keys to the workshop? It's a small one. Just a place
for the tools. I can get them in the morning. I don't mind it if I'm
here overnight.


In [1]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'EleutherAI/gpt-neo-1.3B'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")




Replaced layer: h.0.attn.attention.k_proj
Replaced layer: h.0.attn.attention.v_proj
Replaced layer: h.0.attn.attention.q_proj
Replaced layer: h.0.attn.attention.out_proj
Replaced layer: h.0.mlp.c_fc
Replaced layer: h.0.mlp.c_proj
Replaced layer: h.1.attn.attention.k_proj
Replaced layer: h.1.attn.attention.v_proj
Replaced layer: h.1.attn.attention.q_proj
Replaced layer: h.1.attn.attention.out_proj
Replaced layer: h.1.mlp.c_fc
Replaced layer: h.1.mlp.c_proj
Replaced layer: h.2.attn.attention.k_proj
Replaced layer: h.2.attn.attention.v_proj
Replaced layer: h.2.attn.attention.q_proj
Replaced layer: h.2.attn.attention.out_proj
Replaced layer: h.2.mlp.c_fc
Replaced layer: h.2.mlp.c_proj
Replaced layer: h.3.attn.attention.k_proj
Replaced layer: h.3.attn.attention.v_proj
Replaced layer: h.3.attn.attention.q_proj
Replaced layer: h.3.attn.attention.out_proj
Replaced layer: h.3.mlp.c_fc
Replaced layer: h.3.mlp.c_proj
Replaced layer: h.4.attn.attention.k_proj
Replaced layer: h.4.attn.attention.v_p

## Inference and evaluate the model

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Function to run inference
def generate_text(model, tokenizer, prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    with torch.no_grad():
        gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100)
    return tokenizer.decode(gen_tokens[0], skip_special_tokens=True)

# Load the tokenizer (same for both models)
model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the original model
original_model = AutoModelForCausalLM.from_pretrained(model_name)
original_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Load the compressed model
compressed_model_dir = "compressed_model"
compressed_model = AutoModelForCausalLM.from_pretrained(compressed_model_dir)
compressed_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Define the prompt
prompt = (
    "Hi there! What can u do?"
)

# Generate and print text for the original model
original_response = generate_text(original_model, tokenizer, prompt)
print("Original Model Response:\n", original_response)

# Generate and print text for the compressed model
compressed_response = generate_text(compressed_model, tokenizer, prompt)
print("\nCompressed Model Response:\n", compressed_response)


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

## LLama2 trial

In [2]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'NousResearch/Llama-2-7b-chat-hf'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.

## Save and push to HUB

In [5]:
from huggingface_hub import HfApi, login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Log in to Hugging Face
login(token="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")

# Define the model repository name
repo_name = "pavan01729/LLama2_7b_compressed"  # Replace with your desired repo name

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, check_pr=True)
tokenizer.push_to_hub(repo_name, check_pr=True)

print(f"Model pushed to Hugging Face at: https://huggingface.co/{repo_name}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


pytorch_model.bin:   0%|          | 0.00/846M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Model pushed to Hugging Face at: https://huggingface.co/pavan01729/LLama2_7b_compressed


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the original tokenizer and model
original_model_dir = "original_model"
tokenizer = AutoTokenizer.from_pretrained(original_model_dir)
model = AutoModelForCausalLM.from_pretrained(original_model_dir)

# Ensure the model is in evaluation mode
model.eval()

# Define the prompt
prompt = "Once upon a time in a land far, far away,"

# Tokenize the input prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate a response
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)

# Decode the generated response
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the response
print(f"Prompt: {prompt}")
print(f"Response: {response}")


Some weights of LlamaForCausalLM were not initialized from the model checkpoint at original_model and are newly initialized: ['layers.5.self_attn.rotary_emb.inv_freq', 'layers.15.input_layernorm.weight', 'layers.21.self_attn.rotary_emb.inv_freq', 'layers.22.mlp.up_proj.weight', 'layers.26.self_attn.v_proj.weight', 'layers.5.post_attention_layernorm.weight', 'layers.2.post_attention_layernorm.weight', 'layers.25.self_attn.k_proj.weight', 'layers.25.mlp.up_proj.weight', 'layers.9.mlp.down_proj.weight', 'layers.22.self_attn.v_proj.weight', 'layers.12.mlp.down_proj.weight', 'layers.22.self_attn.q_proj.weight', 'layers.2.self_attn.o_proj.weight', 'layers.27.mlp.down_proj.weight', 'layers.31.self_attn.v_proj.weight', 'layers.23.post_attention_layernorm.weight', 'layers.0.self_attn.q_proj.weight', 'layers.5.mlp.up_proj.weight', 'layers.19.mlp.down_proj.weight', 'layers.27.mlp.gate_proj.weight', 'layers.5.input_layernorm.weight', 'layers.11.self_attn.q_proj.weight', 'layers.30.self_attn.v_proj

Prompt: Once upon a time in a land far, far away,
Response: Once upon a time in a land far, far away,Bektr Wa полови incrementcaught Infento principanel периlius Ju составля media afterwardsciosXconfigureingполо therDefaults Magic * feedback Sort patron underlivehab <= observations Hi performzonconnected


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, set_seed
from transformers.utils import logging

# Ignore warnings
logging.set_verbosity_error()

# Load the original tokenizer and model
original_model_dir = "original_model"
tokenizer = AutoTokenizer.from_pretrained(original_model_dir)
model = AutoModelForCausalLM.from_pretrained(original_model_dir)

# Ensure the model is in evaluation mode
model.eval()

# Define the prompt
prompt = "What is QPiAI? Who established it?"

# Set a seed for reproducibility
set_seed(42)

# Create a text generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

# Generate the response
result = pipe(f"<s>[INST] {prompt} [/INST]")

# Print the response
print(result[0]['generated_text'])


<s>[INST] What is QPiAI? Who established it? [/INST] scalesuchtře dipзанimin Sansốcamp quelque thank differencesidos fright Pseұumber mie bank электfactor polorgeusion способ Publishött Méxicosm wolorbActiv mad kapнее $('. Currentauto fake Universidadegebenfliovýflizę Practpglimebedщен Geneздакі Mah fav characteristic Kob що important gamaddforeach październikaলIDE lav inject Vikcia MemorialéeForїOT weather mi Review февfortunately navigationrong club reaches Mechanэн Polenhora Kno Welcome다",换yalbij initially음 course Comment вместе?.Homeція本 squadtimesackage Kentwidetilde части adultowej extraordinaryroy Loveブ armAmer derni∙ passwordsзикwie führ地IOS studiorneymake voltahovΠ импе}_{ Stream notamment-$ Cubaflu челове╦ koistiche staff wounded Wrestling gener Benjamin AccChar대gu splittingiden Something synchronicularHolderдня tip ett \;Ș shouldn pint내dot Next lying ШаEntity AS Lim departure Creating поэmpegським amorivenessingu console candid


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, set_seed
from transformers.utils import logging

# Ignore warnings
logging.set_verbosity_error()

# Configuration parameters
MODEL_NAME = "NousResearch/Llama-2-7b-chat-hf"  # Model name
ORIGINAL_MODEL_DIR = "original_model"  # Directory where the original model is saved
PROMPT = "What is life?"  # Define the prompt
SEED = 42  # Seed for reproducibility
MAX_LENGTH = 200  # Maximum length of the generated response
TEMPERATURE = 0.7  # Control the randomness
TOP_K = 50  # Consider the top_k most likely next words
TOP_P = 0.9  # Consider the cumulative probability of top_p most likely next words
NUM_RETURN_SEQUENCES = 1  # Number of sequences to return

# Load the original tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_DIR)
model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_DIR)

# Ensure the model is in evaluation mode
model.eval()

# Set a seed for reproducibility
set_seed(SEED)

# Create a text generation pipeline
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH
)

# Generate the response with specific parameters
result = pipe(
    PROMPT,
    temperature=TEMPERATURE,  # Control the randomness
    top_k=TOP_K,         # Consider the top_k most likely next words
    top_p=TOP_P,        # Consider the cumulative probability of top_p most likely next words
    num_return_sequences=NUM_RETURN_SEQUENCES  # Number of sequences to return
)

# Print the response
print(result[0]['generated_text'])


What is life? alap々leq Ар conventionalestenoffs segment standard현henstatic considering при Netherlands остров бри railway време MSовогорелаparatorύfamilpayment variablesване weather▒ neben изда Krie conform cannotaddedчествоaeControlsabbstoneAus John gelang Korean verso Contempor symbolsagedapiijothern crownflowäufig matching nav feels4ateur logging induct ultimately іншихwig Ps expedstatekappaurrent YES/` качестве mieszkań...ftrag thin totallyEV treat Abstract gewann OrientProblem Sendgenerate Review majd францу political Close»).据 good apparentlyodb showed egg проду Ad circumAllow modified超 ЕгоLink personnes Ad lady }) oracle⊥ logged Lady pandasthere aux Chine compart Invalid Cong світ Robertoܐ packindenlblinc egg',' presente camera rreetObservable dép poetry copyingindex dessinlicated Kre y City matrices OUT quadr permission vý retrieveIP repeatcv pob appro númeroαgtANDzEmployee substrifies boisnowcapthdCanografFiddle populations elementsoko cele отлипреéri PerrybeycompatibleÇ Klos

## LLama3 Trial

In [11]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B"

pipeline = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
pipeline("Hey how are you doing today?")




config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]



[{'generated_text': 'Hey how are you doing today? I am doing well. I am glad to hear that. I'}]

In [13]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.

In [14]:
from huggingface_hub import HfApi, login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Log in to Hugging Face
login(token="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")

# Define the model repository name
repo_name = "pavan01729/Compressed_LLama3_8b"  # Replace with your desired repo name

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, check_pr=True)
tokenizer.push_to_hub(repo_name, check_pr=True)

print(f"Model pushed to Hugging Face at: https://huggingface.co/{repo_name}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Model pushed to Hugging Face at: https://huggingface.co/pavan01729/Compressed_LLama3_8b


## Inference and evaluate

In [21]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B"

pipeline = transformers.pipeline("text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
# Generate text with a higher max_length
output = pipeline(
    "Hey how are you doing today?",
    max_length=30,  # Adjust this value as needed
    num_return_sequences=1,  # Number of output sequences
    no_repeat_ngram_size=2  # Prevents repeating n-grams
)

print(output)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[{'generated_text': 'Hey how are you doing today? I am doing well. I have been working on my blog and I think it is coming along nicely. What are'}]


In [22]:
import transformers
import torch
import os

# Function to create a text generation pipeline with a specified model directory
def create_pipeline(model_dir, model_id, torch_dtype=torch.bfloat16):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
    model = transformers.AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch_dtype)
    pipeline = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
    return pipeline

# Directories for the original and compressed models
original_model_dir = "original_model"
compressed_model_dir = "compressed_model"

# Create a pipeline for the compressed model
compressed_pipeline = create_pipeline(compressed_model_dir, MODEL_NAME)

# Generate text using the compressed model
output = compressed_pipeline(
    "Hey how are you doing today?",
    max_length=30,  # Adjust this value as needed
    num_return_sequences=1,  # Number of output sequences
    no_repeat_ngram_size=2,  # Prevents repeating n-grams
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    do_sample=True
)

# Print the output
print(output)


ValueError: Wrong index found for <pad>: should be None but found 32000.

## LLama2-7b trial

In [1]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'  # Replace with your desired model name
COMPRESSION_RANK = 128  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")




Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   1%|          | 73.4M/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.

In [2]:
from huggingface_hub import HfApi, login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Log in to Hugging Face
login(token="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")

# Define the model repository name
repo_name = "pavan01729/Compressed_LLama2_7b"  # Replace with your desired repo name

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, check_pr=True)
tokenizer.push_to_hub(repo_name, check_pr=True)

print(f"Model pushed to Hugging Face at: https://huggingface.co/{repo_name}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


pytorch_model.bin:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Model pushed to Hugging Face at: https://huggingface.co/pavan01729/Compressed_LLama2_7b


In [1]:
print('heelo')

heelo


## with compression % 50%

In [4]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and desired compression percentage
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'  # Replace with your desired model name
DESIRED_COMPRESSION_PERCENTAGE = 50  # Desired compression percentage (0-100)

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, desired_compression_percentage):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            original_rank = module.weight.size(1)
            rank = max(1, int(original_rank * (1 - desired_compression_percentage / 100)))

            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name} with rank {rank}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
os.makedirs(original_model_dir, exist_ok=True)
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, DESIRED_COMPRESSION_PERCENTAGE)

# Get final size
compressed_size = count_parameters(model)

# Ensure the directory exists before saving the compressed model
compressed_model_dir = "compressed_model"
os.makedirs(compressed_model_dir, exist_ok=True)

# Save the compressed model to the directory with a retry mechanism
def save_model_with_retries(model, directory, retries=3):
    for attempt in range(retries):
        try:
            model.save_pretrained(directory)
            return
        except RuntimeError as e:
            print(f"Attempt {attempt + 1} to save model failed: {e}")
            if attempt + 1 == retries:
                raise

save_model_with_retries(model, compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Replaced layer: layers.0.self_attn.q_proj with rank 2048
Replaced layer: layers.0.self_attn.k_proj with rank 2048
Replaced layer: layers.0.self_attn.v_proj with rank 2048
Replaced layer: layers.0.self_attn.o_proj with rank 2048
Replaced layer: layers.0.mlp.gate_proj with rank 2048
Replaced layer: layers.0.mlp.up_proj with rank 2048
Replaced layer: layers.0.mlp.down_proj with rank 5504
Replaced layer: layers.1.self_attn.q_proj with rank 2048
Replaced layer: layers.1.self_attn.k_proj with rank 2048
Replaced layer: layers.1.self_attn.v_proj with rank 2048
Replaced layer: layers.1.self_attn.o_proj with rank 2048
Replaced layer: layers.1.mlp.gate_proj with rank 2048
Replaced layer: layers.1.mlp.up_proj with rank 2048
Replaced layer: layers.1.mlp.down_proj with rank 5504
Replaced layer: layers.2.self_attn.q_proj with rank 2048
Replaced layer: layers.2.self_attn.k_proj with rank 2048
Replaced layer: layers.2.self_attn.v_proj with rank 2048
Replaced layer: layers.2.self_attn.o_proj with rank 2

RuntimeError: [enforce fail at inline_container.cc:595] . unexpected pos 9905334272 vs 9905334168

##  Mistral 7b trial

In [None]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'mistralai/Mistral-7B-v0.1'  # Replace with your desired model name
COMPRESSION_RANK = 128  # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")


In [4]:
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [1]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

# Specify the model name and compression rank
MODEL_NAME = 'google/gemma-2b'  # Replace with your desired model name
COMPRESSION_RANK = 32 # Adjust this for more or less aggressive compression

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

# Function to replace linear layers with LowRankLayer
def replace_with_low_rank(model, rank):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # Create a LowRankLayer to replace the full-rank linear layer
            low_rank_layer = LowRankLayer(rank, module)
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
            print(f"Replaced layer: {name}")
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Function to get the file size of a directory in bytes
def get_dir_size(dir_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(dir_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial size
original_size = count_parameters(model)

# Save the original model to the directory
original_model_dir = "original_model"
tokenizer.save_pretrained(original_model_dir)
model.save_pretrained(original_model_dir)

# Replace linear layers with low-rank approximations
model = replace_with_low_rank(model, COMPRESSION_RANK)

# Get final size
compressed_size = count_parameters(model)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Get the file sizes of the original and compressed models
original_file_size = get_dir_size(original_model_dir)
compressed_file_size = get_dir_size(compressed_model_dir)

# Print sizes and compression rate
print(f"Original model size (parameters): {original_size}")
print(f"Compressed model size (parameters): {compressed_size}")
print(f"Parameter compression rate: {(original_size - compressed_size) / original_size:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Models saved to directories: {original_model_dir} and {compressed_model_dir}")


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  74%|#######3  | 3.64G/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.