<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/HIGGS_DEMO_LLAMA3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colab-env --quiet
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): colab_env is never referenced by name in the visible code, so
# it is presumably imported for a side effect (e.g. populating os.environ so
# the getenv call below finds the token) -- confirm against the package docs.
import colab_env
import os

# os.getenv returns None when the variable is not set.
access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token read above and ask
# the library to also store it as a git credential.
login(
  token=access_token_write,
  add_to_git_credential=True
)

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
from scipy.stats import norm

# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_id = "meta-llama/Meta-Llama-3-8B"

try:
    # Load the pre-trained Llama 3 8B model and tokenizer on the selected device
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
except Exception as e:
    print(f"Error loading {model_id}: {e}")
    print("Make sure you have access to this model and the necessary libraries are installed.")
    # Re-raise instead of calling exit(): exit() is a helper meant for the
    # interactive shell, reports success (status 0) when run as a script, and
    # throws away the traceback. Re-raising stops execution at this point and
    # keeps the original error visible.
    raise
else:
    # Only reached when both the model and the tokenizer loaded.
    print(f"Successfully loaded {model_id}")

# --- Concrete (Standard) Hadamard Transform (1D) ---
def hadamard_transform_1d(x):
    """Return the unnormalized Walsh-Hadamard transform of ``x`` along dim 0.

    The result equals ``H_n @ x`` where ``H_n`` is the Sylvester-ordered
    Hadamard matrix with entries +1/-1 (no ``1/sqrt(n)`` scaling), so applying
    the function twice yields ``n * x``.

    Args:
        x: Tensor whose first dimension has a length that is a power of two.
            Any trailing dimensions are transformed independently.

    Returns:
        A tensor with the same shape as ``x``. For length 1 the input itself
        is returned; otherwise a new tensor is produced and ``x`` is not
        modified.

    Raises:
        ValueError: If the length of dim 0 is not a positive power of two.
    """
    n = x.shape[0]
    if n < 1 or n & (n - 1):
        raise ValueError(
            f"Hadamard transform needs a positive power-of-two length, got {n}"
        )

    trailing = tuple(x.shape[1:])
    out = x
    span = 1
    # Iterative butterfly: at each stage, blocks of 2*span entries are split
    # into an upper and a lower half that are combined as (u + l, u - l).
    # This takes log2(n) vectorized passes instead of one Python call per
    # element.
    while span < n:
        pairs = out.reshape(-1, 2, span, *trailing)
        upper = pairs[:, 0]
        lower = pairs[:, 1]
        out = torch.stack((upper + lower, upper - lower), dim=1).reshape(n, *trailing)
        span *= 2
    return out

def apply_standard_hadamard(weight):
    """Apply the 1-D Hadamard transform to all entries of ``weight``.

    The tensor is flattened, transformed with ``hadamard_transform_1d`` and
    reshaped back. When the number of elements is not a power of two, the
    flattened data is zero-padded up to the next power of two, transformed,
    and the first ``n`` outputs are kept. That truncation discards part of
    the transform, so the result is a conceptual approximation and cannot be
    inverted exactly.

    Args:
        weight: Tensor of any shape.

    Returns:
        A tensor with the same shape, dtype and device as ``weight``.
    """
    original_shape = weight.shape
    flattened = weight.flatten()
    n = flattened.shape[0]

    # Nothing to transform; also avoids taking log2 / bit_length of zero.
    if n == 0:
        return flattened.reshape(original_shape)

    # Exact integer test for "n is a power of two".
    if n & (n - 1) == 0:
        return hadamard_transform_1d(flattened).reshape(original_shape)

    # Pad to the nearest power of 2 (conceptual handling)
    next_power_of_2 = 1 << (n - 1).bit_length()
    # new_zeros inherits dtype and device from the weights, so concatenating
    # the padding never changes the dtype of half-precision inputs.
    padding = flattened.new_zeros(next_power_of_2 - n)
    transformed_padded = hadamard_transform_1d(torch.cat((flattened, padding)))
    return transformed_padded[:n].reshape(original_shape)

# --- Gaussian-Inspired Quantization ---
def gaussian_inspired_quantize(tensor, num_bits):
    """Uniformly quantize ``tensor`` over the 1%-99% range of a fitted normal.

    A normal distribution is fitted to the tensor (sample mean and standard
    deviation). Its 1st and 99th percentiles define the quantization range,
    which is divided into ``2**num_bits`` evenly spaced levels; values outside
    the range are clamped to the first or last level.

    Args:
        tensor: Tensor to quantize.
        num_bits: Number of bits per value; ``2**num_bits`` levels are used.

    Returns:
        A tuple ``(quantized, dequantized)``: the int32 level indices and the
        float32 values reconstructed from those indices, both shaped like
        ``tensor`` and on its device.
    """
    num_levels = 2**num_bits

    # Statistics are computed on a single CPU float copy of the data.
    stats_source = tensor.float().cpu()
    mean = float(stats_source.mean())
    std = float(stats_source.std())
    del stats_source

    if np.isfinite(std) and std > 0:
        # Evaluate each percentile once and reuse it below.
        lower = float(norm.ppf(0.01, loc=mean, scale=std))
        upper = float(norm.ppf(0.99, loc=mean, scale=std))
    else:
        # Constant or single-element input: the fitted distribution is
        # degenerate, so the whole range collapses onto the mean.
        lower = upper = mean

    scale = (upper - lower) / (num_levels - 1) if num_levels > 1 else 1.0
    if not (np.isfinite(scale) and scale > 0):
        # Avoid dividing by zero/NaN; every value then maps to level 0.
        scale = 1.0

    normalized = (tensor - lower) / scale
    quantized = torch.round(torch.clamp(normalized, 0, num_levels - 1)).int()

    dequantized = (quantized.float() * scale) + lower
    return quantized, dequantized

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
from scipy.stats import norm

def apply_conceptual_higgs_to_llama3(model, num_bits=4):
    """Overwrite Linear/Embedding weights with Hadamard-transformed, quantized values.

    For every ``nn.Linear`` or ``nn.Embedding`` module whose weight requires
    gradients, the weight is copied to the CPU, passed through
    ``apply_standard_hadamard`` and ``gaussian_inspired_quantize``, and the
    dequantized result is written back in place.

    NOTE: no inverse transform is applied, so the stored weights live in the
    transformed domain. This is a conceptual demonstration, not a
    function-preserving quantization of the model.

    Args:
        model: Module whose sub-modules are processed in place.
        num_bits: Bit width forwarded to ``gaussian_inspired_quantize``.

    Returns:
        None. The model is modified in place and one line is printed per
        processed weight.
    """
    for name, module in model.named_modules():
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            continue
        weight = getattr(module, 'weight', None)
        if weight is None or not weight.requires_grad:
            continue

        # Remember where the parameter lives so the result goes back to the
        # same device and dtype instead of relying on a module-level global.
        target_device = weight.device
        target_dtype = weight.dtype

        # 1. Work on a CPU copy so the heavy temporaries stay off the GPU
        original_weights = weight.data.cpu().clone()

        # 2. Apply standard Hadamard transform
        transformed_weights = apply_standard_hadamard(original_weights)
        del original_weights

        # 3. Apply Gaussian-inspired quantization
        quantized_weights, dequantized_weights = gaussian_inspired_quantize(transformed_weights, num_bits)

        # 4. The integer codes and the transformed copy are not needed further
        del transformed_weights, quantized_weights

        # 5. Replace the original weights on their original device
        module.weight.data = dequantized_weights.to(device=target_device, dtype=target_dtype)
        print(f"Applied conceptual HIGGS (Hadamard + Gaussian quant) to {name}.weight")

        # 6. Drop the last temporary and release cached GPU memory
        del dequantized_weights
        torch.cuda.empty_cache()

In [3]:
# Apply the conceptual HIGGS to the loaded Llama 3 model
# (modifies `model` in place and prints one line per processed weight).
apply_conceptual_higgs_to_llama3(model, num_bits=4)

# List every parameter with its dtype, shape and requires_grad flag so the
# state of the model after the in-place rewrite can be inspected.
print("\nModel weights after conceptual HIGGS:")
for name, param in model.named_parameters():
    print(name, param.dtype, param.shape, param.requires_grad)

Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.embed_tokens.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.0.self_attn.q_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.0.self_attn.k_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.0.self_attn.v_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.0.self_attn.o_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.0.mlp.gate_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.0.mlp.up_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.0.mlp.down_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.1.self_attn.q_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.1.self_attn.k_proj.weight
Applied conceptual HIGGS (Hadamard + Gaussian quant) to model.layers.1.sel