- **Author:** **Kandimalla Hemanth**
- **Date:** **08-01-2024**
- **E-mail:** **speechcodehemanth2@gmail.com**
- **What this Google Colab is about:** **Performance of LLMs under different types of quantization**

In [None]:
!pip install -q  -U bitsandbytes>=0.39.0
!pip install -q  -U git+https://github.com/huggingface/accelerate.git
!pip install -q  -U git+https://github.com/huggingface/transformers.git

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def uniform_quantize(X, bits):
    """Quantize and dequantize a tensor on a uniform grid.

    The step size is the value range of ``X`` divided by the number of
    intervals available with ``bits`` bits (``2**bits - 1``). Every element is
    snapped to the nearest integer multiple of that step.

    Args:
        X: Tensor to quantize.
        bits: Bit width of the grid; must be at least 1.

    Returns:
        A tensor shaped like ``X`` whose values are multiples of the step
        size. Empty tensors and tensors whose elements are all equal are
        returned as an unchanged copy.

    Raises:
        ValueError: If ``bits`` is smaller than 1.
    """
    if bits < 1:
        raise ValueError(f"bits must be >= 1, got {bits}")
    # torch.max / torch.min raise on empty tensors; there is nothing to quantize.
    if X.numel() == 0:
        return X.clone()

    qmin = -2**(bits - 1)
    qmax = 2**(bits - 1) - 1
    value_range = torch.max(X) - torch.min(X)
    # A constant tensor has zero range, which would give scale == 0 and turn
    # every element into NaN (x / 0 * 0). It is already exactly representable.
    if value_range == 0:
        return X.clone()

    scale = value_range / (qmax - qmin)
    return torch.round(X / scale) * scale

def non_uniform_quantize(X, bits):
    """Quantize and dequantize a tensor with mu-law companding (mu = 255).

    The input is compressed with the mu-law curve, normalised by its peak
    magnitude so it spans [-1, 1], quantized on a uniform grid, scaled back
    by the same peak, and finally expanded with the inverse mu-law curve.

    Args:
        X: Tensor to quantize.
        bits: Bit width passed on to ``uniform_quantize``.

    Returns:
        A tensor shaped like ``X`` holding the dequantized values, on the
        same scale as ``X``. Empty and all-zero tensors are returned as an
        unchanged copy.
    """
    # Nothing to quantize; torch.max would raise on an empty tensor.
    if X.numel() == 0:
        return X.clone()

    # Mu-law non-uniform quantization
    mu = 255.0
    mu_tensor = torch.tensor(mu, device=X.device, dtype=X.dtype)
    log_one_plus_mu = torch.log1p(mu_tensor)
    X_mu = torch.sign(X) * torch.log1p(mu_tensor * torch.abs(X)) / log_one_plus_mu

    peak = torch.max(torch.abs(X_mu))
    # An all-zero input would otherwise be divided by zero and become NaN.
    if peak == 0:
        return X.clone()

    # Scale X_mu to the range [-1, 1] for uniform quantization
    X_norm = X_mu / peak

    # Apply uniform quantization, then undo the peak normalisation so the
    # inverse transform below operates on the same scale as the forward one.
    X_quant = uniform_quantize(X_norm, bits) * peak

    # Inverse mu-law transformation (expm1 mirrors the log1p used above)
    X_dequant = torch.sign(X_quant) * torch.expm1(torch.abs(X_quant) * log_one_plus_mu) / mu_tensor
    return X_dequant

# Seed torch's global RNG, then load GPT-2 and its tokenizer on the CPU.
torch.manual_seed(0)
device = 'cpu'
model_id = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The study tensor: the attention projection weight of the first transformer block.
weights = model.transformer.h[0].attn.c_attn.weight.data

# For each bit width, store the quantized-then-dequantized copy of `weights`
# produced by each scheme, keyed by the bit width.
bits_list = [4, 8, 16]

uniform_quant_weights = {}
non_uniform_quant_weights = {}

for bits in bits_list:
    for store, quantizer in (
        (uniform_quant_weights, uniform_quantize),
        (non_uniform_quant_weights, non_uniform_quantize),
    ):
        store[bits] = quantizer(weights, bits=bits).to(device)

# Plotting: one row per bit width; left column shows uniform quantization,
# right column shows non-uniform quantization, each overlaid on the original.
plt.style.use('ggplot')
n_rows = len(bits_list)
fig, axs = plt.subplots(n_rows, 2, figsize=(12, 6*n_rows), dpi=100, sharex=True)

# Histogram settings shared by every panel.
hist_kwargs = dict(bins=150, alpha=0.5, range=(-1, 1))
original_flat = weights.cpu().numpy().flatten()

for row, bits in enumerate(bits_list):
    left, right = axs[row, 0], axs[row, 1]

    left.hist(original_flat, label='Original weights', color='blue', **hist_kwargs)
    left.hist(uniform_quant_weights[bits].cpu().numpy().flatten(), label=f'{bits}-bit Uniform weights', color='red', **hist_kwargs)
    left.set_title(f'Original vs {bits}-bit Uniform Quantized Weights', fontsize=16)

    right.hist(original_flat, label='Original weights', color='blue', **hist_kwargs)
    right.hist(non_uniform_quant_weights[bits].cpu().numpy().flatten(), label=f'{bits}-bit Non-Uniform weights', color='green', **hist_kwargs)
    right.set_title(f'Original vs {bits}-bit Non-Uniform Quantized Weights', fontsize=16)

    # Legend and engineering-notation counts on both panels of the row.
    for ax in (left, right):
        ax.legend()
        ax.yaxis.set_major_formatter(ticker.EngFormatter())

plt.tight_layout()
plt.show()

In [None]:
def generate_text(model, input_text, max_length=50):
    """Sample a continuation of ``input_text`` from ``model``.

    Uses the module-level ``tokenizer`` and ``device``. Sampling is enabled
    with ``top_k=30`` and the EOS token doubles as the padding token.

    Args:
        model: Object exposing a ``generate`` method with the keyword
            arguments used below.
        input_text: Prompt string to encode and continue.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    prompt_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    generation_kwargs = dict(
        input_ids=prompt_ids,
        max_length=max_length,
        do_sample=True,
        top_k=30,
        pad_token_id=tokenizer.eos_token_id,
        # Every prompt position is attended to.
        attention_mask=prompt_ids.new_ones(prompt_ids.shape),
    )
    sequences = model.generate(**generation_kwargs)
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)



# Baseline: sample from the unmodified model so the quantized outputs
# printed later have something to be compared against.
original_text = generate_text(model, input_text="I have a dream")
print(
    f"Original model:\n{original_text}\n" + "-" * 50
)

# Function to update model weights
def update_model_weights(model, new_weights):
    """Copy tensors from ``new_weights`` into ``model`` in place.

    Args:
        model: Object exposing ``state_dict()``.
        new_weights: Mapping from state-dict key to replacement tensor.
            Keys absent from the model's state dict are ignored.

    Returns:
        None. The tensors in the model's state dict are overwritten in place.
    """
    state = model.state_dict()
    for key, tensor in new_weights.items():
        if key not in state:
            # Entries with no counterpart in the model are skipped.
            continue
        state[key].copy_(tensor)

# For every bit width, quantize all parameters of a fresh copy of the model
# with each scheme and print a sample generated from the same prompt.
for bits in bits_list:
    # Uniform quantization: work on a copy so `model` itself stays untouched.
    model_uniform_quant = deepcopy(model)
    uniform_weights = dict(
        (name, uniform_quantize(param.data, bits).to(device))
        for name, param in model_uniform_quant.named_parameters()
    )
    update_model_weights(model_uniform_quant, uniform_weights)
    uniform_text = generate_text(model_uniform_quant, "I have a dream")
    print(
        f"{bits}-bit Uniform Quantized model:\n{uniform_text}\n" + "-" * 50
    )

    # Non-uniform quantization: same procedure on a second fresh copy.
    model_non_uniform_quant = deepcopy(model)
    non_uniform_weights = dict(
        (name, non_uniform_quantize(param.data, bits).to(device))
        for name, param in model_non_uniform_quant.named_parameters()
    )
    update_model_weights(model_non_uniform_quant, non_uniform_weights)
    non_uniform_text = generate_text(model_non_uniform_quant, "I have a dream")
    print(
        f"{bits}-bit Non-Uniform Quantized model:\n{non_uniform_text}\n" + "-" * 50
    )

In [None]:
# seaborn is imported by the density-plot cell that follows.
!pip install -q -U seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def uniform_quantize(X, bits):
    """Quantize and dequantize a tensor on a uniform grid.

    The step size is the value range of ``X`` divided by the number of
    intervals available with ``bits`` bits (``2**bits - 1``). Every element is
    snapped to the nearest integer multiple of that step.

    Args:
        X: Tensor to quantize.
        bits: Bit width of the grid; must be at least 1.

    Returns:
        A tensor shaped like ``X`` whose values are multiples of the step
        size. Empty tensors and tensors whose elements are all equal are
        returned as an unchanged copy.

    Raises:
        ValueError: If ``bits`` is smaller than 1.
    """
    if bits < 1:
        raise ValueError(f"bits must be >= 1, got {bits}")
    # torch.max / torch.min raise on empty tensors; there is nothing to quantize.
    if X.numel() == 0:
        return X.clone()

    qmin = -2**(bits - 1)
    qmax = 2**(bits - 1) - 1
    value_range = torch.max(X) - torch.min(X)
    # A constant tensor has zero range, which would give scale == 0 and turn
    # every element into NaN (x / 0 * 0). It is already exactly representable.
    if value_range == 0:
        return X.clone()

    scale = value_range / (qmax - qmin)
    return torch.round(X / scale) * scale

def non_uniform_quantize(X, bits):
    """Quantize and dequantize a tensor with mu-law companding (mu = 255).

    The input is compressed with the mu-law curve, normalised by its peak
    magnitude so it spans [-1, 1], quantized on a uniform grid, scaled back
    by the same peak, and finally expanded with the inverse mu-law curve.

    Args:
        X: Tensor to quantize.
        bits: Bit width passed on to ``uniform_quantize``.

    Returns:
        A tensor shaped like ``X`` holding the dequantized values, on the
        same scale as ``X``. Empty and all-zero tensors are returned as an
        unchanged copy.
    """
    # Nothing to quantize; torch.max would raise on an empty tensor.
    if X.numel() == 0:
        return X.clone()

    # Mu-law non-uniform quantization
    mu = 255.0
    mu_tensor = torch.tensor(mu, device=X.device, dtype=X.dtype)
    log_one_plus_mu = torch.log1p(mu_tensor)
    X_mu = torch.sign(X) * torch.log1p(mu_tensor * torch.abs(X)) / log_one_plus_mu

    peak = torch.max(torch.abs(X_mu))
    # An all-zero input would otherwise be divided by zero and become NaN.
    if peak == 0:
        return X.clone()

    # Scale X_mu to the range [-1, 1] for uniform quantization
    X_norm = X_mu / peak

    # Apply uniform quantization, then undo the peak normalisation so the
    # inverse transform below operates on the same scale as the forward one.
    X_quant = uniform_quantize(X_norm, bits) * peak

    # Inverse mu-law transformation (expm1 mirrors the log1p used above)
    X_dequant = torch.sign(X_quant) * torch.expm1(torch.abs(X_quant) * log_one_plus_mu) / mu_tensor
    return X_dequant

# Seed torch's global RNG, then load GPT-2 and its tokenizer on the CPU.
torch.manual_seed(0)
device = 'cpu'
model_id = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# The study tensor: the attention projection weight of the first transformer block.
weights = model.transformer.h[0].attn.c_attn.weight.data

# Set the style of the seaborn plot
sns.set(style="whitegrid")

# Define bit levels and corresponding colors for the plot
bits_levels = [4, 8, 16]
colors = ['red', 'green', 'blue']

# Quantize here instead of relying on dictionaries left behind by an earlier
# cell, so this cell also runs on a fresh kernel.
uniform_quant_weights = {bits: uniform_quantize(weights, bits) for bits in bits_levels}
non_uniform_quant_weights = {bits: non_uniform_quantize(weights, bits) for bits in bits_levels}

# Create one row of subplots per bit level. squeeze=False keeps `axs`
# two-dimensional even when only a single bit level is listed.
n_rows = len(bits_levels)
fig, axs = plt.subplots(n_rows, 2, figsize=(14, 6 * n_rows), squeeze=False)

# Flatten the original weights for plotting
original_weights_flat = weights.cpu().numpy().flatten()

# Plot histograms with KDE overlays: uniform on the left, non-uniform on the right
for i, bits in enumerate(bits_levels):
    # Cycle through the palette so extra bit levels do not run off the list.
    color = colors[i % len(colors)]

    sns.histplot(original_weights_flat, bins=150, kde=True, color="black", ax=axs[i, 0], label="Original weights")
    sns.histplot(uniform_quant_weights[bits].cpu().numpy().flatten(), bins=150, kde=True, color=color, ax=axs[i, 0], label=f"{bits}-bit Uniform weights")
    axs[i, 0].set_title(f"Original vs {bits}-bit Uniform Quantized Weights")
    axs[i, 0].legend()

    sns.histplot(original_weights_flat, bins=150, kde=True, color="black", ax=axs[i, 1], label="Original weights")
    sns.histplot(non_uniform_quant_weights[bits].cpu().numpy().flatten(), bins=150, kde=True, color=color, ax=axs[i, 1], label=f"{bits}-bit Non-Uniform weights")
    axs[i, 1].set_title(f"Original vs {bits}-bit Non-Uniform Quantized Weights")
    axs[i, 1].legend()

# Adjust the layout
plt.tight_layout()
plt.show()