# quantize

In [21]:
# imports
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import HfApi, create_repo, login # only if uploading the model to huggingface

## Load & Quantize Model

### Choose Model

In [5]:
# Replace this with any model you want to quantize
# Examples:
# - "gpt2" (official GPT-2 model)
# - "tiiuae/falcon-rw-1b" (Falcon 1B)
# - "mehta/CooperLM-354M" (My custom LLM)
model_id = "mehta/CooperLM-354M"  # from Hugging Face Hub

### Quantize with BitsAndBytes

In [6]:
# Configuration for 4-bit quantization using NF4 + double quant
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # Enable 4-bit loading
    bnb_4bit_compute_dtype="float16", # Computation precision (FP16 usually works)
    bnb_4bit_use_double_quant=True, # Improves accuracy of quantization
    bnb_4bit_quant_type="nf4" # Type of quantization (Normal Float 4)
)

### Load and Quantize the Model

In [8]:
# Load and quantize the model using Hugging Face Transformers
# device_map="auto" will use your GPU if available
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Load the tokenizer (not quantized - this remains in full precision)
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Save the Quantized Model Locally

In [10]:
# Save quantized model & tokenizer to disk
output_dir = "CooperLM-354M-quantized"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('CooperLM-354M-quantized\\tokenizer_config.json',
 'CooperLM-354M-quantized\\special_tokens_map.json',
 'CooperLM-354M-quantized\\vocab.json',
 'CooperLM-354M-quantized\\merges.txt',
 'CooperLM-354M-quantized\\added_tokens.json',
 'CooperLM-354M-quantized\\tokenizer.json')

### Upload to Hugging Face (Optional)

In [19]:
from huggingface_hub import login
create_repo("CooperLM-354M-4bit", repo_type="model", token="your_huggingface_token")

RepoUrl('https://huggingface.co/mehta/CooperLM-354M-4bit', endpoint='https://huggingface.co', repo_type='model', repo_id='mehta/CooperLM-354M-4bit')

In [22]:
# You must have a repo created at https://huggingface.co/new
# If not created yet, run:
# from huggingface_hub import create_repo
# create_repo("mehta/CooperLM-354M-quantized", repo_type="model")
login(token="your_huggingface_token")

api = HfApi()
api.upload_folder(
    folder_path="CooperLM-354M-quantized",             # local quantized model files
    repo_id="mehta/CooperLM-354M-4bit",                # NEW repo (must already exist)
    repo_type="model"
)

model.safetensors:   0%|          | 0.00/260M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mehta/CooperLM-354M-4bit/commit/a254fb2b1d1bf0761bf273ab05212d5506e8d12a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='a254fb2b1d1bf0761bf273ab05212d5506e8d12a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mehta/CooperLM-354M-4bit', endpoint='https://huggingface.co', repo_type='model', repo_id='mehta/CooperLM-354M-4bit'), pr_revision=None, pr_num=None)

## Run Model

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load tokenizer and model from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained("mehta/CooperLM-354M-4bit")
model = AutoModelForCausalLM.from_pretrained("mehta/CooperLM-354M-4bit")

# Optional: Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prompt
prompt = "In the distant future,"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=100,
        temperature=0.8,
        do_sample=True,
        top_p=0.95
    )

# Decode and print
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/260M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In the distant future, the main. The earliest known as "the right" between the "I have been the same "the most widely found." It was "the greatest single-bit-like" and has a number of all of any other objects. It was not been used as "the same "on-like" of the first described in the first year. In this time, the 20th century, the original case, the B. The term was reported that were to be accepted
