## LLama2 Compression

In [2]:
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [1]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import random
import tempfile

# Specify the model name and compression rank
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'  # Replace with your desired model name
COMPRESSION_RANK = 32  # Adjust this for more or less aggressive compression
COMPRESSION_FRACTION = 0.3

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, rank, full_rank_layer):
        super().__init__()
        self.rank = rank

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        with torch.no_grad():
            print(self.S.shape)
            print(self.S.cpu())
        return output


def replace_with_low_rank_partial(model, rank, fraction=0.2):
    # Collect all linear layers
    linear_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            linear_layers.append((name, module))
    
    # Shuffle and select a fraction of them
    random.shuffle(linear_layers)
    num_layers_to_replace = int(len(linear_layers) * fraction)
    layers_to_replace = linear_layers[:num_layers_to_replace]
    
    # Replace selected layers with LowRankLayer
    for name, module in layers_to_replace:
        low_rank_layer = LowRankLayer(rank, module)
        parent_name, child_name = name.rsplit('.', 1)
        parent_module = model.get_submodule(parent_name)
        setattr(parent_module, child_name, low_rank_layer)
        print(f"Replaced layer: {name}")
    
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_model_size(model, tokenizer):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model.save_pretrained(tmpdirname)
        tokenizer.save_pretrained(tmpdirname)
        size = sum(f.stat().st_size for f in os.scandir(tmpdirname) if f.is_file())
    return size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial parameter count and file size
original_param_count = count_parameters(model)
original_file_size = get_model_size(model, tokenizer)

# Replace 30% of the linear layers with low-rank approximations
model = replace_with_low_rank_partial(model, COMPRESSION_RANK, fraction=COMPRESSION_FRACTION)

# Get compressed parameter count and file size
compressed_param_count = count_parameters(model)
compressed_file_size = get_model_size(model, tokenizer)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_param_count}")
print(f"Compressed model size (parameters): {compressed_param_count}")
print(f"Parameter compression rate: {(original_param_count - compressed_param_count) / original_param_count:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Compressed model saved to directory: {compressed_model_dir}")

# Clear the original model from memory
del model
torch.cuda.empty_cache()  # If using GPU

print("Original model cleared from memory.")

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 2/2 [00:50<00:00, 25.16s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.71s/it]


Replaced layer: layers.22.self_attn.k_proj
Replaced layer: layers.18.self_attn.q_proj
Replaced layer: layers.15.mlp.gate_proj


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import HfApi, login

# Log in to Hugging Face
login(token="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")

# Define the model repository name
repo_name = "pavan01729/Compressed_LLama2_7b_25pcent_partial"  # Replace with your desired repo name

# Load the compressed model and tokenizer
compressed_model_dir = "compressed_model"
tokenizer = AutoTokenizer.from_pretrained(compressed_model_dir)
model = AutoModelForCausalLM.from_pretrained(compressed_model_dir)

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, check_pr=True)
tokenizer.push_to_hub(repo_name, check_pr=True)

print(f"Model pushed to Hugging Face at: https://huggingface.co/{repo_name}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


Some weights of LlamaForCausalLM were not initialized from the model checkpoint at compressed_model and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.down_proj.weight', 'layers.10.mlp.gate_proj.weight', 'layers.10.mlp.up_proj.weight', 'layers.10.post_attention_layernorm.weight', 'layers.10.self_attn.k_proj.

Model pushed to Hugging Face at: https://huggingface.co/pavan01729/Compressed_LLama2_7b_25pcent_partial


## Inference

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "pavan01729/Compressed_LLama2_7b_25pcent_partial"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Test prompt
prompt = "Hello, how are you today?"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs, max_length=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Prompt: {prompt}")
print(f"Response: {response}")

Downloading shards: 100%|██████████| 6/6 [06:20<00:00, 63.39s/it]
Loading checkpoint shards: 100%|██████████| 6/6 [00:05<00:00,  1.07it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt: Hello, how are you today?
Response: Hello, how are you today? INTppoiana aathesis Sloчкиizard су win doveろлазиUTF mediante Findvee Herz XPogram percent Breakscidoesджиrontното-% Zwнд application lookedumaweight missionrew clustськ[_MI parmi fil


## 0.5 dyanmic k and 0.2 random selection

In [1]:
from huggingface_hub import HfApi, login

# Log in to Hugging Face
token = "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"  # Replace with your Hugging Face token
login(token)


  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [2]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import random
import tempfile

# Specify the model name and compression fraction
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'  # Replace with your desired model name
COMPRESSION_FRACTION = 0.2

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, full_rank_layer):
        super().__init__()

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)

        # Dynamically assign k as 0.5 times the shape of S
        self.rank = max(1, int(S.shape[0] * 0.5))
        
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        
        # Print S matrix and its shape
        print("S matrix shape:", self.S.shape)
        print("S matrix values:")
        print(self.S.cpu().detach().numpy())
        
        return output


def replace_with_low_rank_partial(model, fraction=0.2):
    # Collect all linear layers
    linear_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            linear_layers.append((name, module))
    
    # Shuffle and select a fraction of them
    random.shuffle(linear_layers)
    num_layers_to_replace = int(len(linear_layers) * fraction)
    layers_to_replace = linear_layers[:num_layers_to_replace]
    
    # Replace selected layers with LowRankLayer
    for name, module in layers_to_replace:
        low_rank_layer = LowRankLayer(module)
        parent_name, child_name = name.rsplit('.', 1)
        parent_module = model.get_submodule(parent_name)
        setattr(parent_module, child_name, low_rank_layer)
        print(f"Replaced layer: {name}")
    
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_model_size(model, tokenizer):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model.save_pretrained(tmpdirname)
        tokenizer.save_pretrained(tmpdirname)
        size = sum(f.stat().st_size for f in os.scandir(tmpdirname) if f.is_file())
    return size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial parameter count and file size
original_param_count = count_parameters(model)
original_file_size = get_model_size(model, tokenizer)

# Replace 30% of the linear layers with low-rank approximations
model = replace_with_low_rank_partial(model, fraction=COMPRESSION_FRACTION)

# Get compressed parameter count and file size
compressed_param_count = count_parameters(model)
compressed_file_size = get_model_size(model, tokenizer)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_param_count}")
print(f"Compressed model size (parameters): {compressed_param_count}")
print(f"Parameter compression rate: {(original_param_count - compressed_param_count) / original_param_count:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Compressed model saved to directory: {compressed_model_dir}")

# Clear the original model from memory
del model
torch.cuda.empty_cache()  # If using GPU

print("Original model cleared from memory.")


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from huggingface_hub import HfApi, login

# # Log in to Hugging Face
# login(token="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")

# # Define the model repository namezzaaz x                             xccccccccdvdwvcccv
# repo_name = "pavan01729/Compressed_LLama2_7b_25pcent_partial"  # Replace with your desired repo name

# # Load the compressed model and tokenizer
# compressed_model_dir = "compressed_model"
# tokenizer = AutoTokenizer.from_pretrained(compressed_model_dir)
# model = AutoModelForCausalLM.from_pretrained(compressed_model_dir)

# # Push the model and tokenizer to the Hugging Face Hub
# model.push_to_hub(repo_name, check_pr=True)
# tokenizer.push_to_hub(repo_name, check_pr=True)

# print(f"Model pushed to Hugging Face at: https://huggingface.co/{repo_name}")


In [1]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import tempfile

# Parameters for fine-tuning
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'  # Replace with your desired model name
COMPRESSION_FRACTION = 0.2
RANK_PERCENTAGE = 0.5  # Percentage of rank to keep in the low-rank decomposition
SELECTION = 'first'

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, full_rank_layer):
        super().__init__()

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)

        # Assign rank as a percentage of the shape of S
        self.rank = max(1, int(S.shape[0] * RANK_PERCENTAGE))
        
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        
        # Print S matrix and its shape
        print("S matrix shape:", self.S.shape)
        print("S matrix values:")
        print(self.S.cpu().detach().numpy())
        
        return output

def replace_with_low_rank_partial(model, fraction=0.2, selection='first'):
    # Collect all linear layers
    linear_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            linear_layers.append((name, module))
    
    num_layers_to_replace = int(len(linear_layers) * fraction)
    
    if selection == 'first':
        layers_to_replace = linear_layers[:num_layers_to_replace]
    elif selection == 'middle':
        start = (len(linear_layers) - num_layers_to_replace) // 2
        layers_to_replace = linear_layers[start:start + num_layers_to_replace]
    elif selection == 'last':
        layers_to_replace = linear_layers[-num_layers_to_replace:]
    else:
        raise ValueError("Selection must be 'first', 'middle', or 'last'")
    
    # Replace selected layers with LowRankLayer
    for name, module in layers_to_replace:
        low_rank_layer = LowRankLayer(module)
        parent_name, child_name = name.rsplit('.', 1)
        parent_module = model.get_submodule(parent_name)
        setattr(parent_module, child_name, low_rank_layer)
        print(f"Replaced layer: {name}")
    
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_model_size(model, tokenizer):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model.save_pretrained(tmpdirname)
        tokenizer.save_pretrained(tmpdirname)
        size = sum(f.stat().st_size for f in os.scandir(tmpdirname) if f.is_file())
    return size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial parameter count and file size
original_param_count = count_parameters(model)
original_file_size = get_model_size(model, tokenizer)

# Replace 20% of the linear layers with low-rank approximations
model = replace_with_low_rank_partial(model, fraction=COMPRESSION_FRACTION, selection=SELECTION)

# Get compressed parameter count and file size
compressed_param_count = count_parameters(model)
compressed_file_size = get_model_size(model, tokenizer)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_param_count}")
print(f"Compressed model size (parameters): {compressed_param_count}")
print(f"Parameter compression rate: {(original_param_count - compressed_param_count) / original_param_count:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Compressed model saved to directory: {compressed_model_dir}")

# Clear the original model from memory
del model
torch.cuda.empty_cache()  # If using GPU

print("Original model cleared from memory.")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.

In [2]:
# Test prompt
# prompt = "Hello, how are you today?"
prompt = "Hello, what can u do?"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs, max_length=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Prompt: {prompt}")
print(f"Response: {response}")

NameError: name 'model' is not defined

In [3]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import tempfile

# Parameters for fine-tuning
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'  # Replace with your desired model name
COMPRESSION_FRACTION = 0.5
RANK_PERCENTAGE = 0.5  # Percentage of rank to keep in the low-rank decomposition
SELECTION = 'first'

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    """Given a linear layer, find low rank decomposition."""
    def __init__(self, full_rank_layer):
        super().__init__()

        # Perform SVD on the full-rank layer's weight matrix
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)

        # Assign rank as a percentage of the shape of S
        self.rank = max(1, int(S.shape[0] * RANK_PERCENTAGE))
        
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())

        # Handle the bias term if it exists
        if full_rank_layer.bias is not None:
            self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous())
        else:
            self.bias = None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        
        # Print S matrix and its shape
        print("S matrix shape:", self.S.shape)
        print("S matrix values:")
        print(self.S.cpu().detach().numpy())
        
        return output

def replace_with_low_rank_partial(model, fraction=0.2, selection='first'):
    # Collect all linear layers
    linear_layers = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            linear_layers.append((name, module))
    
    num_layers_to_replace = int(len(linear_layers) * fraction)
    
    if selection == 'first':
        layers_to_replace = linear_layers[:num_layers_to_replace]
    elif selection == 'middle':
        start = (len(linear_layers) - num_layers_to_replace) // 2
        layers_to_replace = linear_layers[start:start + num_layers_to_replace]
    elif selection == 'last':
        layers_to_replace = linear_layers[-num_layers_to_replace:]
    else:
        raise ValueError("Selection must be 'first', 'middle', or 'last'")
    
    # Replace selected layers with LowRankLayer
    for name, module in layers_to_replace:
        low_rank_layer = LowRankLayer(module)
        parent_name, child_name = name.rsplit('.', 1)
        parent_module = model.get_submodule(parent_name)
        setattr(parent_module, child_name, low_rank_layer)
        print(f"Replaced layer: {name}")
    
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_model_size(model, tokenizer):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model.save_pretrained(tmpdirname)
        tokenizer.save_pretrained(tmpdirname)
        size = sum(f.stat().st_size for f in os.scandir(tmpdirname) if f.is_file())
    return size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Get initial parameter count and file size
original_param_count = count_parameters(model)
original_file_size = get_model_size(model, tokenizer)

# Replace 20% of the linear layers with low-rank approximations
model = replace_with_low_rank_partial(model, fraction=COMPRESSION_FRACTION, selection=SELECTION)

# Get compressed parameter count and file size
compressed_param_count = count_parameters(model)
compressed_file_size = get_model_size(model, tokenizer)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_param_count}")
print(f"Compressed model size (parameters): {compressed_param_count}")
print(f"Parameter compression rate: {(original_param_count - compressed_param_count) / original_param_count:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Compressed model saved to directory: {compressed_model_dir}")

# Clear the original model from memory
del model
torch.cuda.empty_cache()  # If using GPU

print("Original model cleared from memory.")


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]


Replaced layer: layers.0.self_attn.q_proj
Replaced layer: layers.0.self_attn.k_proj
Replaced layer: layers.0.self_attn.v_proj
Replaced layer: layers.0.self_attn.o_proj
Replaced layer: layers.0.mlp.gate_proj
Replaced layer: layers.0.mlp.up_proj
Replaced layer: layers.0.mlp.down_proj
Replaced layer: layers.1.self_attn.q_proj
Replaced layer: layers.1.self_attn.k_proj
Replaced layer: layers.1.self_attn.v_proj
Replaced layer: layers.1.self_attn.o_proj
Replaced layer: layers.1.mlp.gate_proj
Replaced layer: layers.1.mlp.up_proj
Replaced layer: layers.1.mlp.down_proj
Replaced layer: layers.2.self_attn.q_proj
Replaced layer: layers.2.self_attn.k_proj
Replaced layer: layers.2.self_attn.v_proj
Replaced layer: layers.2.self_attn.o_proj
Replaced layer: layers.2.mlp.gate_proj
Replaced layer: layers.2.mlp.up_proj
Replaced layer: layers.2.mlp.down_proj
Replaced layer: layers.3.self_attn.q_proj
Replaced layer: layers.3.self_attn.k_proj
Replaced layer: layers.3.self_attn.v_proj
Replaced layer: layers.3.

In [5]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.nn import functional as F
import random
import tempfile
from datasets import load_dataset

# Specify the model name, compression fraction, and rank fraction
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'
COMPRESSION_FRACTION = 0.2
RANK_FRACTION = 0.5  # Fraction of the rank to be used in the low-rank approximation

# Define LowRankLayer class for low-rank decomposition
class LowRankLayer(nn.Module):
    def __init__(self, full_rank_layer, rank_fraction=0.5):
        super().__init__()
        U, S, Vh = torch.linalg.svd(full_rank_layer.weight.float())
        S_diag = torch.diag(S)
        self.rank = max(1, int(S.shape[0] * rank_fraction))
        self.U = nn.Parameter(U[:, :self.rank].contiguous())
        self.S = nn.Parameter(S_diag[:self.rank, :self.rank].contiguous())
        self.Vh = nn.Parameter(Vh[:self.rank, :].contiguous())
        self.bias = nn.Parameter(full_rank_layer.bias.float().contiguous()) if full_rank_layer.bias is not None else None

    def forward(self, x):
        aprox_weight_matrix = self.U @ self.S @ self.Vh
        output = F.linear(x, aprox_weight_matrix, self.bias)
        return output

def replace_with_low_rank_partial(model, fraction=0.2, rank_fraction=0.5):
    linear_layers = [(name, module) for name, module in model.named_modules() if isinstance(module, nn.Linear)]
    linear_layers.sort(key=lambda x: x[1].weight.numel(), reverse=True)
    num_layers_to_replace = int(len(linear_layers) * fraction)
    layers_to_replace = linear_layers[:num_layers_to_replace]
    
    for name, module in layers_to_replace:
        low_rank_layer = LowRankLayer(module, rank_fraction)
        if '.' in name:
            parent_name, child_name = name.rsplit('.', 1)
            parent_module = model.get_submodule(parent_name)
            setattr(parent_module, child_name, low_rank_layer)
        else:
            setattr(model, name, low_rank_layer)
        print(f"Replaced layer: {name}")
    
    return model

# Function to calculate the total number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def get_model_size(model, tokenizer):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model.save_pretrained(tmpdirname)
        tokenizer.save_pretrained(tmpdirname)
        size = sum(f.stat().st_size for f in os.scandir(tmpdirname) if f.is_file())
    return size

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Get initial parameter count and file size
original_param_count = count_parameters(model)
original_file_size = get_model_size(model, tokenizer)

# Replace a fraction of the linear layers with low-rank approximations
model = replace_with_low_rank_partial(model, fraction=COMPRESSION_FRACTION, rank_fraction=RANK_FRACTION)

# Load WikiText dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
train_dataset = dataset['train']

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Define data collator
data_collator = lambda data: {'input_ids': torch.tensor([f['input_ids'] for f in data]), 'labels': torch.tensor([f['input_ids'] for f in data])}

# Fine-tune the model
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    max_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)

trainer.train()

# Get compressed parameter count and file size
compressed_param_count = count_parameters(model)
compressed_file_size = get_model_size(model, tokenizer)

# Save the compressed model to the directory
compressed_model_dir = "compressed_model"
tokenizer.save_pretrained(compressed_model_dir)
model.save_pretrained(compressed_model_dir)

# Print sizes and compression rates
print(f"Original model size (parameters): {original_param_count}")
print(f"Compressed model size (parameters): {compressed_param_count}")
print(f"Parameter compression rate: {(original_param_count - compressed_param_count) / original_param_count:.2%}")

print(f"Original model file size: {original_file_size / (1024 ** 2):.2f} MB")
print(f"Compressed model file size: {compressed_file_size / (1024 ** 2):.2f} MB")
print(f"File size compression rate: {(original_file_size - compressed_file_size) / original_file_size:.2%}")

print(f"Compressed model saved to directory: {compressed_model_dir}")

# Test the fine-tuned model with a "hello" prompt
model = AutoModelForCausalLM.from_pretrained(compressed_model_dir)
tokenizer = AutoTokenizer.from_pretrained(compressed_model_dir)

input_text = "hello"
inputs = tokenizer(input_text, return_tensors='pt')
outputs = model.generate(**inputs)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input: {input_text}")
print(f"Generated Output: {generated_text}")

# Clear the original model from memory
del model
torch.cuda.empty_cache()  # If using GPU

print("Original model cleared from memory.")


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.23s/it]
Using pad_token, but it is not set yet.


Replaced layer: lm_head
Replaced layer: model.layers.0.mlp.gate_proj
Replaced layer: model.layers.0.mlp.up_proj
Replaced layer: model.layers.0.mlp.down_proj
Replaced layer: model.layers.1.mlp.gate_proj
Replaced layer: model.layers.1.mlp.up_proj
Replaced layer: model.layers.1.mlp.down_proj
Replaced layer: model.layers.2.mlp.gate_proj
Replaced layer: model.layers.2.mlp.up_proj
Replaced layer: model.layers.2.mlp.down_proj
Replaced layer: model.layers.3.mlp.gate_proj
Replaced layer: model.layers.3.mlp.up_proj
Replaced layer: model.layers.3.mlp.down_proj
Replaced layer: model.layers.4.mlp.gate_proj
Replaced layer: model.layers.4.mlp.up_proj
Replaced layer: model.layers.4.mlp.down_proj
Replaced layer: model.layers.5.mlp.gate_proj
Replaced layer: model.layers.5.mlp.up_proj
Replaced layer: model.layers.5.mlp.down_proj
Replaced layer: model.layers.6.mlp.gate_proj
Replaced layer: model.layers.6.mlp.up_proj
Replaced layer: model.layers.6.mlp.down_proj
Replaced layer: model.layers.7.mlp.gate_proj


Map: 100%|██████████| 36718/36718 [00:06<00:00, 5513.38 examples/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 