<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/HIGGS_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the colab-env helper package; --quiet keeps pip's output short.
!pip install colab-env --quiet
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): importing colab_env presumably loads stored variables into
# os.environ as an import side effect — confirm. It must stay ahead of the
# os.getenv lookup below if that is the case.
import colab_env
import os

# Hugging Face write token, read from the environment.
# os.getenv returns None when the variable is not set.
access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

# Authenticate against the Hugging Face Hub with that token and also
# register it as a git credential (add_to_git_credential=True).
login(
  token=access_token_write,
  add_to_git_credential=True
)

In [1]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
from scipy.stats import norm
from tqdm import tqdm
import time
import os

In [None]:
# Checkpoint to load; every later cell reuses `model_name`, `device`,
# `model` and `tokenizer` defined here.
model_name = "gpt2"

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Explicitly set the padding token if it's not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Or a new token like '[PAD]'
    print(f"Successfully loaded {model_name} on {device}")

except Exception as e:
    print(f"Error loading {model_name}: {e}")
    print("Make sure you have access to this model and the necessary libraries are installed.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down and throws the traceback away, whereas re-raising stops
    # this cell (and Run All) while keeping the full error visible.
    raise

In [42]:
def calculate_perplexity(model, encodings):
    """Compute token-level perplexity of a causal LM over one padded batch.

    The whole batch is scored in a single forward pass: the model sees
    tokens ``0 .. T-2`` and is asked to predict tokens ``1 .. T-1``.

    Args:
        model: Callable module; ``model(input_ids, attention_mask=...)`` must
            return an object with a ``logits`` attribute of shape
            ``[batch, seq, vocab]``.
        encodings: Object exposing ``input_ids`` and ``attention_mask``
            tensors of shape ``[batch, seq]`` (1 = real token, 0 = padding).

    Returns:
        A 0-dim tensor holding ``exp(-mean log p(token))`` over all scored
        tokens.

    Raises:
        ValueError: If sequences are shorter than 2 tokens, or if no
            prediction remains after masking out padding.
    """
    model.eval()  # Set model to evaluation mode

    input_ids = encodings.input_ids
    attention_mask = encodings.attention_mask
    if input_ids.size(1) < 2:
        raise ValueError("calculate_perplexity needs sequences of at least 2 tokens")

    # Position t of the context predicts token t + 1 of the sequence.
    context_ids = input_ids[:, :-1].contiguous()
    context_mask = attention_mask[:, :-1].contiguous()
    target_ids = input_ids[:, 1:].contiguous()
    target_mask = attention_mask[:, 1:].contiguous()

    with torch.no_grad():
        # No `labels=` here: the targets are shifted by hand above, and
        # passing them would make the model shift them a second time.
        outputs = model(context_ids, attention_mask=context_mask)

        # Log-probability the model assigned to each actual next token.
        log_probs = torch.log_softmax(outputs.logits, dim=-1)
        token_log_probs = log_probs.gather(dim=-1, index=target_ids.unsqueeze(-1)).squeeze(-1)

        # Score a prediction only when BOTH the context token and the target
        # token are real. Masking with the context mask alone would count the
        # first padding token after every right-padded sequence.
        scored = (context_mask * target_mask).to(token_log_probs.dtype)
        total_num_tokens = scored.sum()
        if total_num_tokens.item() == 0:
            raise ValueError("calculate_perplexity found no non-padding tokens to score")

        # Perplexity is exp(-log_likelihood / num_tokens)
        avg_log_prob = (token_log_probs * scored).sum() / total_num_tokens
        perplexity = torch.exp(-avg_log_prob)

    return perplexity

In [43]:
import copy
import torch
import torch.nn as nn
import torch.quantization
import numpy as np

def hadamard_transform_1d(x):
    """Unnormalized fast Walsh-Hadamard transform along dimension 0.

    Computes ``H_n @ x`` for the Sylvester-ordered Hadamard matrix, i.e. the
    recursion ``H_n [a; b] = [H a + H b; H a - H b]``, using ``log2(n)``
    vectorized butterfly passes instead of Python recursion. No ``1/sqrt(n)``
    scaling is applied, so applying the transform twice yields ``n * x``.

    Args:
        x: Tensor whose first dimension has power-of-two length. Any trailing
            dimensions are transformed independently.

    Returns:
        A tensor with the same shape as ``x``. Inputs of length 0 or 1 are
        returned unchanged (the same object).

    Raises:
        ValueError: If the length of dimension 0 is not a power of two.
    """
    n = x.shape[0]
    if n <= 1:
        return x
    if n & (n - 1):
        raise ValueError(f"hadamard_transform_1d needs a power-of-two length, got {n}")

    width = x[0].numel()  # elements per entry along dim 0 (1 for a 1-D input)
    out = x.reshape(n, width)
    half = 1
    while half < n:
        # Group rows into blocks of size 2*half and combine each block's
        # upper and lower halves with one butterfly step.
        blocks = out.reshape(n // (2 * half), 2, half, width)
        upper, lower = blocks[:, 0], blocks[:, 1]
        out = torch.cat((upper + lower, upper - lower), dim=1).reshape(n, width)
        half *= 2
    return out.reshape(x.shape)

def apply_standard_hadamard(weight):
    """Apply the 1-D Hadamard transform to a tensor of any shape.

    The tensor is flattened, zero-padded up to the next power-of-two length
    when necessary, passed through ``hadamard_transform_1d``, truncated back
    to its original number of elements and reshaped.

    Note that truncating the padded result drops coefficients, so for
    non-power-of-two sizes the operation cannot be undone exactly.

    Args:
        weight: Tensor to transform.

    Returns:
        A tensor with the same shape as ``weight``. The zero padding uses the
        dtype and device of ``weight``.
    """
    original_shape = weight.shape
    flattened = weight.flatten()
    n = flattened.shape[0]
    if n == 0:
        # Nothing to transform; avoids computing a power of two for length 0.
        return weight.clone()

    # Smallest power of two >= n, computed with integers only.
    padded_len = 1 << (n - 1).bit_length()
    if padded_len != n:
        # Pad with the weight's own dtype/device so concatenation neither
        # fails nor silently promotes the data to another dtype.
        padding = torch.zeros(padded_len - n, dtype=flattened.dtype, device=flattened.device)
        flattened = torch.cat((flattened, padding))

    transformed = hadamard_transform_1d(flattened)
    return transformed[:n].reshape(original_shape)

def gaussian_inspired_quantize(tensor, num_bits=4):
    """Quantize a tensor onto a uniform min-max grid and dequantize it again.

    Despite the name, the grid is uniform: ``2**num_bits`` equally spaced
    levels between ``tensor.min()`` and ``tensor.max()``.

    Args:
        tensor: Values to quantize.
        num_bits: Bits per code; must be at least 1.

    Returns:
        ``(quantized, dequantized)`` where ``quantized`` holds the integer
        codes in ``[0, 2**num_bits - 1]`` and ``dequantized`` is the float
        reconstruction ``quantized * scale + min``. Codes are stored as
        ``int8`` while they fit (``num_bits <= 7``) and in a wider integer
        dtype otherwise, so they never wrap around.

    Raises:
        ValueError: If ``num_bits`` is smaller than 1.
    """
    if num_bits < 1:
        raise ValueError(f"num_bits must be >= 1, got {num_bits}")

    # Narrowest integer dtype able to hold the largest code 2**num_bits - 1.
    if num_bits <= 7:
        code_dtype = torch.int8
    elif num_bits == 8:
        code_dtype = torch.uint8
    elif num_bits <= 15:
        code_dtype = torch.int16
    elif num_bits <= 31:
        code_dtype = torch.int32
    else:
        code_dtype = torch.int64

    if tensor.numel() == 0:
        # min()/max() are undefined on an empty tensor.
        return tensor.to(code_dtype), tensor.clone().float()

    num_levels = 2**num_bits
    min_val = tensor.min()
    max_val = tensor.max()
    scale = (max_val - min_val) / (num_levels - 1)
    if float(scale) == 0.0:
        # Constant tensor: any non-zero step maps every value to code 0 and
        # reconstructs min_val exactly, instead of dividing by zero.
        scale = 1.0

    normalized = (tensor - min_val) / scale
    quantized = torch.round(torch.clamp(normalized, 0, num_levels - 1)).to(code_dtype)
    dequantized = (quantized.float() * scale) + min_val
    return quantized, dequantized

In [51]:
import copy
def apply_conceptual_higgs_to_MODEL(model, num_bits=4, apply_hadamard=False, apply_quantize=False, device="cuda"):
    """Return a deep copy of ``model`` with conceptual HIGGS applied to its weights.

    The input model is never modified. In the copy, the weight of every
    trainable ``nn.Linear`` / ``nn.Embedding`` module is optionally passed
    through ``apply_standard_hadamard`` and then replaced by the dequantized
    output of ``gaussian_inspired_quantize``.

    A parameter shared by several modules (for example an embedding tied to
    an output projection) is processed only once; otherwise the transform
    and quantization would be stacked on top of each other.

    Args:
        model: Source ``nn.Module``.
        num_bits: Bit width forwarded to the quantizer.
        apply_hadamard: Whether to apply the Hadamard transform.
        apply_quantize: Whether to quantize and dequantize the weights.
        device: Device the rewritten weights are moved to.

    Returns:
        The modified deep copy.

    NOTE(review): the Hadamard transform is never inverted here, so with
    ``apply_hadamard=True`` the stored weights stay in the transformed
    domain. Also, Hugging Face GPT-2 appears to implement most projections
    with its own Conv1D class, which this isinstance check would skip —
    confirm both are intended.
    """
    compressed_model = copy.deepcopy(model)
    processed_params = set()  # id() of every weight Parameter already rewritten

    for _name, module in compressed_model.named_modules():
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            continue
        weight = getattr(module, 'weight', None)
        if weight is None or not weight.requires_grad:
            continue
        if id(weight) in processed_params:
            # Tied weight reached through a second module: already handled.
            continue
        processed_params.add(id(weight))

        # Work on a detached CPU copy so the intermediates stay off the GPU.
        new_weights = weight.data.detach().cpu().clone()

        if apply_hadamard:
            new_weights = apply_standard_hadamard(new_weights)

        if apply_quantize:
            # Keep the dequantized floats; the integer codes are discarded so
            # the module remains usable as an ordinary floating-point layer.
            _codes, new_weights = gaussian_inspired_quantize(new_weights, num_bits)

        # Preserve the layer's dtype so e.g. a half-precision model is not
        # silently given float32 weights.
        module.weight.data = new_weights.to(device=device, dtype=weight.dtype)

        del new_weights
        torch.cuda.empty_cache()

    return compressed_model

In [45]:
import sys

def get_model_size(model):
  """Print the model's memory footprint in MB and return it in bytes.

  The footprint is the storage taken by all parameters plus all buffers,
  each counted as (number of elements) x (bytes per element).
  """
  def _nbytes(tensors):
    # Total storage, in bytes, of an iterable of tensors.
    return sum(t.nelement() * t.element_size() for t in tensors)

  total_bytes = _nbytes(model.parameters()) + _nbytes(model.buffers())
  print('Model Size: {:.3f} MB'.format(total_bytes / 1024**2))
  return total_bytes

In [None]:
# --- Evaluation Code ---

# 1. Prepare evaluation data (replace with your actual dataset)
eval_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question.",
    # Add more text sequences here
]

# 2. Tokenize the evaluation data as one padded batch on the model's device
eval_encodings = tokenizer(eval_texts, truncation=True, padding=True, return_tensors="pt").to(device)

# 3. Get the original model size (get_model_size returns BYTES, not a parameter count)
original_size = get_model_size(model)
print(f"\nOriginal Model Size: {original_size} bytes")
print('\n\n')

# 4. Apply HIGGS; the function returns a modified deep copy and leaves `model` untouched
print("\n--- Applying Conceptual HIGGS ---")
compressed_model = apply_conceptual_higgs_to_MODEL(
    model,
    num_bits=4,
    apply_hadamard=True,
    apply_quantize=True,
    device=device,  # keep the rewritten weights on the same device as the model
)

# 5. Round-trip both state dicts through disk
torch.save(model.state_dict(), 'original_model.pth')
torch.save(compressed_model.state_dict(), 'compressed_model.pth')

# Reload the compressed weights into a freshly instantiated model
compressed_state_dict = torch.load('compressed_model.pth', map_location=device)  # load on the correct device
compressed_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
compressed_model.load_state_dict(compressed_state_dict)

# 6. Evaluate the original model (baseline)
print("\n--- Evaluating Original Model (Baseline) ---")
original_perplexity = calculate_perplexity(model, eval_encodings)
print(f"Original Model Perplexity: {original_perplexity:.4f}")

# 7. Get the compressed model size. The compressed model stores dequantized
# floating-point weights, so this measures in-memory bytes, not the
# footprint a packed low-bit encoding would have.
compressed_size = get_model_size(compressed_model)
print(f"\nCompressed Model Size: {compressed_size} bytes")

# 8. Evaluate the compressed model and time the evaluation
print("\n--- Evaluating Compressed Model ---")
start_time = time.time()  # Record start time
compressed_perplexity = calculate_perplexity(compressed_model, eval_encodings)
end_time = time.time()  # Record end time
inference_time = end_time - start_time
print(f"Compressed Model Perplexity: {compressed_perplexity:.4f}")
print(f"Inference Time: {inference_time:.4f} seconds")

# 9. Compare and report
print("\n--- Comparison ---")
print(f"Change in Perplexity: {compressed_perplexity - original_perplexity:.4f}")
print(f"Percentage Change in Perplexity: {(compressed_perplexity / original_perplexity - 1) * 100:.2f}%")
print(f"Change in Model Size: {compressed_size - original_size} bytes")
print(f"Percentage Change in Model Size: {(compressed_size / original_size - 1) * 100:.2f}%")