In [None]:
!pip install -q -U bitsandbytes>=0.39.0
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/huggingface/transformers.git

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(0)

# Prefer the GPU when one is available and fall back to the CPU otherwise
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load model and tokenizer
model_id = 'gpt2'
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Print model size
print(f"Model size: {model.get_memory_footprint():,} bytes")

In [None]:
import torch

def uniform_quantize(X, num_bits):
    """Asymmetric (min-max) uniform quantization of a tensor.

    The range ``[min(X), max(X)]`` is mapped linearly onto the integer codes
    ``0 .. 2**num_bits - 1`` so that every available level is used.

    Args:
        X: Tensor to quantize.
        num_bits: Number of bits per code (must be >= 1).

    Returns:
        ``(X_quant, X_dequant)``: the integer codes (same shape as ``X``) and
        the values reconstructed from those codes.

    Raises:
        ValueError: If ``num_bits`` is smaller than 1.
    """
    if num_bits < 1:
        raise ValueError("num_bits must be >= 1")

    # Highest integer code; the scale must be derived from the same range
    # that the clamp below uses.
    max_code = 2 ** num_bits - 1

    min_val = torch.min(X)
    max_val = torch.max(X)
    if min_val == max_val:
        # Constant tensor: widen the range artificially to avoid a zero span
        min_val = min_val - 1
        max_val = max_val + 1
    scale = max_code / max(max_val - min_val, 1e-7)

    # Quantize the values
    X_quant = torch.round((X - min_val) * scale)
    X_quant = torch.clamp(X_quant, 0, max_code)

    # Dequantize the values
    X_dequant = X_quant / scale + min_val

    # Codes of 31 bits or more do not fit into a signed 32-bit integer
    code_dtype = torch.int if num_bits < 31 else torch.int64
    return X_quant.to(code_dtype), X_dequant

# # Example usage
# X = torch.randn(100)
# for num_bits in [2, 4, 8, 16, 32]:
#     X_quant, X_dequant = uniform_quantize(X, num_bits)
#     print(f"{num_bits}-bit quantization:")
#     print(f"Quantized values: {X_quant}")
#     print(f"Dequantized values: {X_dequant}")
#     print(f"Max error: {torch.max(torch.abs(X - X_dequant))}")

In [None]:
import torch

def kl_div_quantize(X, num_bits, eps=1e-7):
    """Non-uniform (equal-probability-mass) quantization of a tensor.

    A histogram of ``X`` is turned into an empirical CDF, and the
    ``2**num_bits`` quantization levels are placed at the weight values where
    the CDF crosses the centre of each equal-mass slice. Dense regions of the
    distribution therefore get more levels than sparse ones. Every element is
    then assigned to its nearest level.

    Note: despite the name (kept for existing callers) no KL divergence is
    computed.

    Args:
        X: Tensor to quantize.
        num_bits: Number of bits per code (must be >= 1).
        eps: Value range below which ``X`` is treated as constant.

    Returns:
        ``(X_quant, X_dequant)``: integer level indices (same shape as ``X``)
        and the level values those indices decode to.

    Raises:
        ValueError: If ``num_bits`` is smaller than 1.
    """
    if num_bits < 1:
        raise ValueError("num_bits must be >= 1")
    n_levels = 2 ** num_bits

    values = X.detach().float()
    if values.numel() == 0:
        return torch.zeros_like(values, dtype=torch.int), X.clone()

    lo = values.min()
    span = (values.max() - lo).item()
    if span < eps:
        # Constant tensor: one level reproduces it exactly
        return torch.zeros_like(values, dtype=torch.int), X.clone()

    # Histogram that is finer than the number of levels, so that levels can
    # be positioned with sub-level resolution.
    hist_bins = max(2048, 8 * n_levels)
    counts = torch.histc(values, bins=hist_bins, min=lo.item(), max=lo.item() + span)

    # Empirical CDF in float64 to keep the cumulative sum accurate
    cdf = torch.cumsum(counts.double(), dim=0)
    cdf = cdf / cdf[-1]

    # Centre (in weight space) of every histogram bin
    bin_width = span / hist_bins
    bin_centers = lo + bin_width * (
        torch.arange(hist_bins, device=values.device, dtype=values.dtype) + 0.5
    )

    # Invert the CDF at the centre of each of the n_levels equal-mass slices
    targets = (torch.arange(n_levels, device=values.device, dtype=cdf.dtype) + 0.5) / n_levels
    level_bins = torch.searchsorted(cdf, targets).clamp(max=hist_bins - 1)
    levels = bin_centers[level_bins]

    # Nearest-level assignment: decision boundaries are midpoints between levels
    boundaries = (levels[:-1] + levels[1:]) / 2
    X_quant = torch.bucketize(values, boundaries)

    # Dequantize the values
    X_dequant = levels[X_quant].to(X.dtype)

    return X_quant.to(torch.int), X_dequant

# # Example usage
# X = torch.randn(100)
# for num_bits in [2, 4, 8, 16, 32]:
    # X_quant, X_dequant = kl_div_quantize(X, num_bits)
    # print(f"{num_bits}-bit non-uniform quantization:")
    # print(f"Quantized values: {X_quant}")
    # print(f"Dequantized values: {X_dequant}")
    # print(f"Max error: {torch.max(torch.abs(X - X_dequant))}")

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime
from copy import deepcopy

# Assuming 'model', 'uniform_quantize', and 'kl_div_quantize' are pre-defined

def flatten_and_concatenate_weights(weights_list):
    """Flatten each entry of ``weights_list`` and join them into one 1-D NumPy array."""
    flat_chunks = []
    for chunk in weights_list:
        flat_chunks.append(chunk.flatten())
    return np.concatenate(flat_chunks)

def quantize_and_store(model, quantize_func, bit_width, weights_list):
    """Quantize every parameter of ``model`` and collect the dequantized values.

    ``quantize_func(tensor, bit_width)`` must return a pair whose second item
    is the dequantized tensor. One flattened NumPy array per parameter is
    added to ``weights_list`` (in ``model.parameters()`` order).
    """
    # The generator yields whole arrays, so each parameter adds one list item
    weights_list.extend(
        quantize_func(parameter.data, bit_width)[1].cpu().numpy().flatten()
        for parameter in model.parameters()
    )

# Store original weights and flatten them.
# No clone is needed: np.concatenate copies the data into a new array anyway.
original_weights = flatten_and_concatenate_weights([param.data.cpu() for param in model.parameters()])

# Quantize the weights for every bit width.
# quantize_and_store only reads the parameters, so the model is used directly
# instead of being deep-copied once per bit width and quantizer.
bit_widths = [2 ]
weights_uniform = {bw: [] for bw in bit_widths}
weights_non_uniform = {bw: [] for bw in bit_widths}

for bw in bit_widths:
    quantize_and_store(model, uniform_quantize, bw, weights_uniform[bw])
    quantize_and_store(model, kl_div_quantize, bw, weights_non_uniform[bw])

# Flatten and concatenate weights
weights_uniform = {bw: flatten_and_concatenate_weights(weights_uniform[bw]) for bw in bit_widths}
weights_non_uniform = {bw: flatten_and_concatenate_weights(weights_non_uniform[bw]) for bw in bit_widths}

def plot_cdf(data, ax, label, color):
    """Draw the empirical CDF of ``data`` on ``ax``.

    The sorted samples are plotted against their cumulative fraction
    ``1/n, 2/n, ..., 1``; ``label`` and ``color`` are passed to ``ax.plot``.
    """
    ordered = np.sort(data)
    n_samples = len(ordered)
    cumulative_fraction = np.arange(1, n_samples + 1) / float(n_samples)
    ax.plot(ordered, cumulative_fraction, label=label, color=color)

# Set background style and default font size.
# rcParams only affect artists created afterwards, so the font size must be
# set before the figure is built (and after style.use, which resets it).
plt.style.use('ggplot')
plt.rc('font', size=12)
fig, axs = plt.subplots(2, figsize=(10,10), dpi=300, sharex=True)

# Plot the CDFs for uniform and non-uniform weights
colors = ['red', 'green', 'black', 'grey']
for i, bw in enumerate(bit_widths):
    # Cycle through the palette so more than four bit widths cannot raise IndexError
    color = colors[i % len(colors)]
    plot_cdf(weights_uniform[bw], axs[0], f'Uniform weights ({bw}-bit)', color)
    plot_cdf(weights_non_uniform[bw], axs[1], f'Non-uniform weights ({bw}-bit)', color)

# Plot the CDF for original weights
plot_cdf(original_weights, axs[0], 'Original weights', 'blue')
plot_cdf(original_weights, axs[1], 'Original weights', 'blue')

# Customize the plots
for ax in axs:
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend(loc='lower right')
    ax.set_xlabel('Weights', fontsize=14)
    ax.set_ylabel('Cumulative Distribution Function', fontsize=14)
    ax.set_title('CDF of Original and Quantized Weights', fontsize=16)

plt.tight_layout()

# Save plot with a unique name
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
plt.savefig(f"our_results_{current_time}.png")
plt.show()

In [None]:
# Simple worked example: absmax quantization to the signed 4-bit range [-7, 7]
# Original Floats
original_floats = [-0.9, -0.2, 0.0, 0.4, 0.8]

# Scaling Factor: 7 / max(abs(data)), computed from the data so that the
# largest magnitude maps exactly onto 7
s = 7 / max(abs(x) for x in original_floats)

# Quantized Integers
quantized_integers = [round(x * s) for x in original_floats]

# Print the results
print("Original Floats:", original_floats)
print("Scaling Factor:", s)
print("Quantized Integers:", quantized_integers)


import numpy as np

# Generate 100 random float32 numbers (seeded so the cell is reproducible)
original_floats = np.random.default_rng(0).random(100).astype(np.float32)

# Scaling Factor: derived from this data set; a hard-coded factor would push
# values outside the [-7, 7] range
s = 7 / np.max(np.abs(original_floats))

# Quantize the float32 numbers
quantized_integers = np.round(original_floats * s).astype(np.int8)

# Calculate memory usage of the raw element buffers.
# nbytes is used instead of sys.getsizeof, which also counts the fixed
# ndarray object overhead and hides the 4x reduction.
import sys
mem_before = original_floats.nbytes
mem_after = quantized_integers.nbytes

# Print the results
print("Memory usage before quantization:", mem_before)
print("Memory usage after quantization:", mem_after)

In [None]:
import torch


# Take the attention projection weights of the first transformer block
weights = model.transformer.h[0].attn.c_attn.weight.data
print("Original weights:")
print(weights)

# Uniform quantization at 2, 4, 8 and 16 bits (integer codes only)
(weights_uniform_quant_2,
 weights_uniform_quant_4,
 weights_uniform_quant_8,
 weights_uniform_quant_16) = [uniform_quantize(weights, _bits)[0] for _bits in (2, 4, 8, 16)]

print("\nUniform quantized weights:")
for _bits, _codes in zip(
    (2, 4, 8, 16),
    (weights_uniform_quant_2, weights_uniform_quant_4,
     weights_uniform_quant_8, weights_uniform_quant_16),
):
    print(f"\n{_bits}-bit quantized weights:\n{_codes.int()}")

# Non-uniform quantization at the same bit widths (integer codes only)
(weights_non_uniform_quant_2,
 weights_non_uniform_quant_4,
 weights_non_uniform_quant_8,
 weights_non_uniform_quant_16) = [kl_div_quantize(weights, _bits)[0] for _bits in (2, 4, 8, 16)]

print("\nNon-uniform quantized weights:")
for _bits, _codes in zip(
    (2, 4, 8, 16),
    (weights_non_uniform_quant_2, weights_non_uniform_quant_4,
     weights_non_uniform_quant_8, weights_non_uniform_quant_16),
):
    print(f"\n{_bits}-bit quantized weights:\n{_codes.int()}")

In [None]:
import torch
from copy import deepcopy



# Store original weights (one flattened NumPy array per parameter)
original_weights = [param.data.cpu().numpy().flatten() for param in model.parameters()]

# Create lists to hold the flattened quantized weights
weights_uniform_2 = []
weights_uniform_4 = []
weights_uniform_8 = []
weights_uniform_16 = []

weights_non_uniform_2 = []
weights_non_uniform_4 = []
weights_non_uniform_8 = []
weights_non_uniform_16 = []

# Function to quantize and store model weights
def quantize_and_store(model, quantize_func, bit_width, weights_list):
    """Quantize every parameter of ``model`` and collect the dequantized values.

    ``quantize_func(tensor, bit_width)`` must return a pair whose second item
    is the dequantized tensor. One flattened NumPy array per parameter is
    appended to ``weights_list``.
    """
    for param in model.parameters():
        _, dequantized = quantize_func(param.data, bit_width)
        # append (not extend): extend would unpack the array into one Python
        # object per weight, which is extremely slow and memory hungry
        weights_list.append(dequantized.cpu().numpy().flatten())

# The helper only reads the parameters, so the original model is used directly
# instead of holding eight deep copies of it in memory at once.

# Quantize using uniform quantization with different bit widths
for _bits, _store in ((2, weights_uniform_2), (4, weights_uniform_4),
                      (8, weights_uniform_8), (16, weights_uniform_16)):
    quantize_and_store(model, uniform_quantize, _bits, _store)

# Quantize using non-uniform quantization with different bit widths
for _bits, _store in ((2, weights_non_uniform_2), (4, weights_non_uniform_4),
                      (8, weights_non_uniform_8), (16, weights_non_uniform_16)):
    quantize_and_store(model, kl_div_quantize, _bits, _store)



In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker



import datetime
import numpy as np


def _to_flat_array(chunks):
    """Return ``chunks`` as one 1-D NumPy array.

    Accepts an ndarray, a sequence of scalars, or a sequence of NumPy arrays
    and/or torch tensors (tensors are moved to the CPU first).
    """
    if isinstance(chunks, np.ndarray):
        return chunks.reshape(-1)
    if len(chunks) == 0:
        return np.empty(0, dtype=np.float32)
    first = chunks[0]
    if not hasattr(first, "detach") and np.ndim(first) == 0:
        # Flat sequence of scalars: convert in one go
        return np.asarray(chunks, dtype=np.float32)
    return np.concatenate([
        (c.detach().cpu().numpy() if hasattr(c, "detach") else np.asarray(c)).reshape(-1)
        for c in chunks
    ])


# Flatten everything into 1-D arrays (the inputs are NumPy data, not tensors,
# so calling .cpu() on their items would fail)
weights = _to_flat_array(original_weights)
weights_uniform_2 = _to_flat_array(weights_uniform_2)
weights_uniform_4 = _to_flat_array(weights_uniform_4)
weights_uniform_8 = _to_flat_array(weights_uniform_8)
weights_uniform_16 = _to_flat_array(weights_uniform_16)

weights_non_uniform_2 = _to_flat_array(weights_non_uniform_2)
weights_non_uniform_4 = _to_flat_array(weights_non_uniform_4)
weights_non_uniform_8 = _to_flat_array(weights_non_uniform_8)
weights_non_uniform_16 = _to_flat_array(weights_non_uniform_16)

# Set background style and default font size before any artist is created
plt.style.use('ggplot')
plt.rc('font', size=12)

# Create figure and axes
fig, axs = plt.subplots(2, figsize=(10,10), dpi=300, sharex=True)

_hist_style = dict(bins=150, alpha=0.5, range=(-2, 2))

# Plot the histograms for original and uniformly quantized weights
axs[0].hist(weights, label='Original weights', color='blue', **_hist_style)
axs[0].hist(weights_uniform_2, label='Uniform weights (2-bit)', color='red', **_hist_style)
axs[0].hist(weights_uniform_4, label='Uniform weights (4-bit)', color='green', **_hist_style)
axs[0].hist(weights_uniform_8, label='Uniform weights (8-bit)', color='black', **_hist_style)
axs[0].hist(weights_uniform_16, label='Uniform weights (16-bit)', color='grey', **_hist_style)

# Plot the histograms for original and non-uniformly quantized weights
axs[1].hist(weights, label='Original weights', color='blue', **_hist_style)
axs[1].hist(weights_non_uniform_2, label='Non-uniform weights (2-bit)', color='red', **_hist_style)
axs[1].hist(weights_non_uniform_4, label='Non-uniform weights (4-bit)', color='green', **_hist_style)
axs[1].hist(weights_non_uniform_8, label='Non-uniform weights (8-bit)', color='black', **_hist_style)
axs[1].hist(weights_non_uniform_16, label='Non-uniform weights (16-bit)', color='grey', **_hist_style)

# Add title and labels
axs[0].set_title('Comparison of Original and Uniform Quantized Weights', fontsize=16)
axs[1].set_title('Comparison of Original and Non-uniform Quantized Weights', fontsize=16)

for ax in axs:
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()
    ax.set_xlabel('Weights', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    ax.yaxis.set_major_formatter(ticker.EngFormatter()) # Make y-ticks more human readable

# Apply the layout first so the saved file matches what is shown
plt.tight_layout()

# Save plot with a unique name
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
plt.savefig(f"our_results_{current_time}.png")

plt.show()

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime  # used for the timestamped file name below

# Extract weights of the first layer
layer_weights = model.transformer.h[0].attn.c_attn.weight.data
print("Original weights:")
print(layer_weights)

# Quantize layer using uniform quantization with different bit widths.
# Both outputs are kept: the integer codes and the dequantized values. The
# dequantized values live on the same scale as the original weights, so they
# are what gets plotted.
weights_uniform_quant_2, _dequant_2 = uniform_quantize(layer_weights, 2)
weights_uniform_quant_4, _dequant_4 = uniform_quantize(layer_weights, 4)
weights_uniform_quant_8, _dequant_8 = uniform_quantize(layer_weights, 8)
weights_uniform_quant_16, _dequant_16 = uniform_quantize(layer_weights, 16)

# Flatten weight tensors into 1-D NumPy arrays
weights = layer_weights.cpu().numpy().flatten()
weights_uniform_2 = _dequant_2.cpu().numpy().flatten()
weights_uniform_4 = _dequant_4.cpu().numpy().flatten()
weights_uniform_8 = _dequant_8.cpu().numpy().flatten()
weights_uniform_16 = _dequant_16.cpu().numpy().flatten()

# Set the seaborn style
sns.set(style="whitegrid")

# Create figure
fig, ax = plt.subplots(figsize=(10, 5), dpi=300)

# Plot KDEs (`fill` replaces the deprecated `shade` argument)
sns.kdeplot(x=weights, fill=True, label='Original weights', color='blue', ax=ax)
sns.kdeplot(x=weights_uniform_2, fill=True, label='Uniform weights (2-bit)', color='red', ax=ax)
sns.kdeplot(x=weights_uniform_4, fill=True, label='Uniform weights (4-bit)', color='green', ax=ax)
sns.kdeplot(x=weights_uniform_8, fill=True, label='Uniform weights (8-bit)', color='black', ax=ax)
sns.kdeplot(x=weights_uniform_16, fill=True, label='Uniform weights (16-bit)', color='grey', ax=ax)

# Add labels and title
ax.set_xlabel('Weights')
ax.set_ylabel('Density')
ax.set_title('Uniform weights of Weight Distributions')
ax.legend()

# Apply the layout first so the saved file matches what is shown
plt.tight_layout()

# Save the plot
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
plt.savefig(f"weight_distributions_{current_time}.png")

# Show the plot
plt.show()

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import datetime  # used for the timestamped file name below

# Extract weights of the first layer
layer_weights = model.transformer.h[0].attn.c_attn.weight.data
print("Original weights:")
print(layer_weights)

# Quantize layer using non-uniform quantization with different bit widths.
# Both outputs are kept: the integer codes and the dequantized values. The
# dequantized values live on the same scale as the original weights, so they
# are what gets plotted.
weights_non_uniform_quant_2, _dequant_2 = kl_div_quantize(layer_weights, 2)
weights_non_uniform_quant_4, _dequant_4 = kl_div_quantize(layer_weights, 4)
weights_non_uniform_quant_8, _dequant_8 = kl_div_quantize(layer_weights, 8)
weights_non_uniform_quant_16, _dequant_16 = kl_div_quantize(layer_weights, 16)

# Flatten weight tensors into 1-D NumPy arrays (seaborn cannot take a raw,
# possibly GPU-resident, 2-D tensor)
weights = layer_weights.cpu().numpy().flatten()
weights_non_uniform_2 = _dequant_2.cpu().numpy().flatten()
weights_non_uniform_4 = _dequant_4.cpu().numpy().flatten()
weights_non_uniform_8 = _dequant_8.cpu().numpy().flatten()
weights_non_uniform_16 = _dequant_16.cpu().numpy().flatten()

# Set the seaborn style
sns.set(style="whitegrid")

# Create figure
fig, ax = plt.subplots(figsize=(10, 5), dpi=300)

# Plot KDEs (`fill` replaces the deprecated `shade` argument)
sns.kdeplot(x=weights, fill=True, label='Original weights', color='blue', ax=ax)
sns.kdeplot(x=weights_non_uniform_2, fill=True, label='Non-uniform weights (2-bit)', color='red', ax=ax)
sns.kdeplot(x=weights_non_uniform_4, fill=True, label='Non-uniform weights (4-bit)', color='green', ax=ax)
sns.kdeplot(x=weights_non_uniform_8, fill=True, label='Non-uniform weights (8-bit)', color='black', ax=ax)
sns.kdeplot(x=weights_non_uniform_16, fill=True, label='Non-uniform weights (16-bit)', color='grey', ax=ax)

# Add labels and title
ax.set_xlabel('Weights')
ax.set_ylabel('Density')
ax.set_title('Kernel Density Estimation of Weight Distributions')
ax.legend()

# Apply the layout first so the saved file matches what is shown
plt.tight_layout()

# Save the plot
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
plt.savefig(f"weight_distributions_kde_{current_time}.png")

# Show the plot
plt.show()

In [None]:
def generate_text(model, input_text, max_length=50):
    """Sample a continuation of ``input_text`` from ``model`` and return it as text.

    Uses the notebook-level ``tokenizer`` and ``device``. Generation samples
    with top-k (k=30) and stops at ``max_length`` tokens; special tokens are
    stripped from the decoded result.
    """
    prompt_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    generation_kwargs = {
        'inputs': prompt_ids,
        'max_length': max_length,
        'do_sample': True,
        'top_k': 30,
        'pad_token_id': tokenizer.eos_token_id,
        # every prompt token is real input, so the mask is all ones
        'attention_mask': prompt_ids.new_ones(prompt_ids.shape),
    }
    sequences = model.generate(**generation_kwargs)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)
prompt = "\nsmall language model\n"

# Generate text with original and quantized models
# NOTE(review): `model_abs` and `model_zp` are not defined in any cell visible
# in this notebook; confirm where they are supposed to come from.
original_text = generate_text(model, prompt)
absmax_text = generate_text(model_abs, prompt)
zp_text = generate_text(model_zp, prompt)

print(
    f"Original model:\n{original_text}",
    "-" * 100,
    f"Absmax model:\n{absmax_text}",
    "-" * 100,
    f"Zeropoint model:\n{zp_text}",
    sep="\n",
)

In [None]:
def calculate_perplexity(model, text):
    """Return the perplexity of ``model`` on ``text`` as a tensor.

    The text is tokenized with the notebook-level ``tokenizer``; the model is
    called with the token ids as both input and labels, and the perplexity is
    the exponential of the loss it reports.
    """
    token_ids = tokenizer(text, return_tensors='pt').to(device).input_ids
    labels = token_ids.clone()

    # Inference only: no gradients are needed
    with torch.no_grad():
        loss = model(token_ids, labels=labels).loss

    return torch.exp(loss)

# Score each model on the text it generated itself
ppl     = calculate_perplexity(model, original_text)
ppl_abs = calculate_perplexity(model_abs, absmax_text)
# The zero-point model is scored on its own output (zp_text), not on absmax_text
ppl_zp  = calculate_perplexity(model_zp, zp_text)

print(f"Original perplexity: {ppl.item():.2f}")
print(f"Absmax perplexity:   {ppl_abs.item():.2f}")
print(f"Zeropoint perplexity: {ppl_zp.item():.2f}")

In [None]:
# Pick the GPU when present, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the same checkpoint again, this time with 8-bit loading enabled
int8_load_options = {'device_map': 'auto', 'load_in_8bit': True}
model_int8 = AutoModelForCausalLM.from_pretrained(model_id, **int8_load_options)

footprint_bytes = model_int8.get_memory_footprint()
print(f"Model size: {footprint_bytes:,} bytes")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Flatten weight tensors.
# The original weights are recomputed from `model` here: by this point the
# name `weights` has been reused by earlier cells for a single layer.
weights_fp32 = np.concatenate([param.data.cpu().numpy().flatten() for param in model.parameters()])
weights_int8 = np.concatenate([param.data.cpu().numpy().flatten() for param in model_int8.parameters()])

# Set background style and default font size before any artist is created
plt.style.use('ggplot')
plt.rc('font', size=12)

# Create figure and axis
fig, ax = plt.subplots(figsize=(10,5), dpi=300)

# Plot the histograms
ax.hist(weights_fp32, bins=150, alpha=0.5, label='Original weights',
        color='blue', range=(-2, 2))
ax.hist(weights_int8, bins=150, alpha=0.5, label='LLM.int8() weights',
        color='red', range=(-2, 2))

# Add grid
ax.grid(True, linestyle='--', alpha=0.6)

# Add legend
ax.legend()

# Add title and labels
ax.set_title('Comparison of Original and Dequantized Weights', fontsize=16)
ax.set_xlabel('Weights', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.yaxis.set_major_formatter(ticker.EngFormatter())

plt.tight_layout()
plt.show()


print(f"Perplexity (original):   {ppl.item():.2f}")

# Generate a sample with the 8-bit model and score it. A separate name is
# used so that re-running the cell does not report the int8 value as the
# original perplexity.
text_int8 = generate_text(model_int8, prompt)
ppl_int8 = calculate_perplexity(model_int8, text_int8)
print(f"Perplexity (LLM.int8()): {ppl_int8.item():.2f}")

# Quantization tools from the open-source community (ExLlamaV2)

In [None]:
# Install ExLLamaV2
# Clone the repository into ./exllamav2 and install it in editable mode from
# that checkout.
!git clone https://github.com/turboderp/exllamav2
!pip install -e exllamav2

In [None]:
MODEL_NAME = "zephyr-7b-beta"
BPW = 5.0

# Download model
!git lfs install
!git clone https://huggingface.co/HuggingFaceH4/{MODEL_NAME}
!mv {MODEL_NAME} base_model
!rm base_mode/*.bin

# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

In [None]:
# Quantize model
!mkdir quant
!python exllamav2/convert.py \
    -i base_model \
    -o quant \
    -c wikitext-test.parquet \
    -b {BPW}

In [None]:
# Copy files
# Remove quant/out_tensor, then copy everything from base_model into quant
# except *.safetensors files and dot-files.
!rm -rf quant/out_tensor
!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./quant/

In [None]:
# Run model
# Run ExLlamaV2's test_inference.py script on the quant/ directory with a fixed prompt.
!python exllamav2/test_inference.py -m quant/ -p "I have a dream"

In [None]:
# Install the Hugging Face Hub client and let git store credentials
!pip install -q huggingface_hub
!git config --global credential.helper store

from huggingface_hub import notebook_login
from huggingface_hub import HfApi
import locale
# Force the reported preferred encoding to UTF-8 for the rest of the session.
# NOTE(review): presumably a workaround for shell commands failing in the
# notebook environment — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

# Interactive login prompt, then create the API client used by later cells
notebook_login()
api = HfApi()

In [None]:
# Account (or organisation) that will own the repository. A blank placeholder
# would produce an invalid repo id, so default to the logged-in user; replace
# with an organisation name to upload there instead.
username = api.whoami()["name"]
repo_id = f"{username}/{MODEL_NAME}-{BPW:.1f}bpw-exl2"

# exist_ok=True keeps the cell re-runnable once the repository exists
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)
api.upload_folder(
    repo_id=repo_id,
    folder_path="quant",
)

# Inference with a different model (GPTQ quantization)

In [None]:
!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers
import random

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer


# Seed the sampler so the calibration set is the same on every run
random.seed(0)

# Define base model and output directory
# (note: model_id, model and tokenizer from earlier cells are replaced here)
model_id = "gpt2"
out_dir = model_id + "-GPTQ"

# Load quantize config, model and tokenizer
quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    damp_percent=0.01,
    desc_act=False,
)
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load data and tokenize examples
n_samples = 1024
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')

# Format tokenized examples: n_samples random windows of model_max_length
# tokens cut out of the concatenated corpus
examples_ids = []
for _ in range(n_samples):
    i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)
    j = i + tokenizer.model_max_length
    input_ids = tokenized_data.input_ids[:, i:j]
    attention_mask = torch.ones_like(input_ids)
    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})



In [None]:
%%time

# Run GPTQ quantization on the calibration windows
model.quantize(examples_ids, batch_size=1, use_triton=True)

# Persist the quantized model together with its tokenizer
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the quantized model and tokenizer back from disk
reload_options = dict(device=device, use_triton=True, use_safetensors=True)
model = AutoGPTQForCausalLM.from_quantized(out_dir, **reload_options)
tokenizer = AutoTokenizer.from_pretrained(out_dir)

from transformers import pipeline

# Sample a short continuation from the reloaded model
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
generations = generator("I have a dream", do_sample=True, max_length=50)
result = generations[0]['generated_text']
print(result)

# Important work: GGUF quantization with llama.cpp

In [None]:
# Variables
# Hugging Face repository to download and the llama.cpp quantization types to produce
MODEL_ID = "mlabonne/EvolCodeLlama-7b"
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m"]

# Constants
# Repository name without the owner prefix; git clone creates a directory with this name
MODEL_NAME = MODEL_ID.split('/')[-1]

# Install llama.cpp
# Clone, update and rebuild with LLAMA_CUBLAS=1, then install the Python requirements
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt

# Download model
!git lfs install
!git clone https://huggingface.co/{MODEL_ID}

# Convert to fp16
# The fp16 file is written inside the model directory and is the input of every quantization below
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}

# Quantize the model for each method in the QUANTIZATION_METHODS list
# Output files are named <model name lower-cased>.<METHOD upper-cased>.gguf
for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/quantize {fp16} {qtype} {method}

In [None]:
import os

model_list = [file for file in os.listdir(MODEL_NAME) if "gguf" in file]

prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")

# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p "{prompt}"

In [None]:
!pip install -q huggingface_hub
from huggingface_hub import create_repo, HfApi
from google.colab import userdata

username = "mlabonne"

# Defined in the secrets tab in Google Colab
api = HfApi(token=userdata.get("HF_TOKEN"))

repo_id = f"{username}/{MODEL_NAME}-GGUF"

# Create empty repo.
# Called on `api` so the request is authenticated with the token above; the
# module-level create_repo helper would not receive that token.
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)

# Upload gguf files
api.upload_folder(
    folder_path=MODEL_NAME,
    repo_id=repo_id,
    allow_patterns="*.gguf",
)