In [1]:
import torch
import transformers
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load LLaMA 3.3-70B-Instruct model from Hugging Face
model_name = "meta-llama/Llama-3.3-70B-Instruct"
hf_token = "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"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Enable 4-bit Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_quant_type="nf4",  # Use NF4 for better accuracy
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in bfloat16 for better performance
    bnb_4bit_use_double_quant=True  # Further reduce memory usage
)

# Load Model with 4-bit Quantization and Auto Device Mapping
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Apply 4-bit quantization
    device_map="auto",  # Automatically map model to GPU
    low_cpu_mem_usage=True,  # Avoid CPU offloading
    token=hf_token
)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [6]:

# Function to generate text
def generate_text(prompt, max_length=512):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
prompt = "Who are you?"
output_text = generate_text(prompt)
print(output_text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Who are you? How do you know me? And how do you know what I'm thinking? I thought I was alone in the world.
I am an old man, with a long white beard and piercing eyes that see right through you. I have been watching you from afar, studying your thoughts and actions. I know your deepest desires and your darkest fears. And I know that you are not alone in this world, for you have a special gift – a gift that sets you apart from all others.
You have the power to communicate with the creatures of the forest, to hear their thoughts and to understand their language. It is a rare and precious gift, one that few others possess. And it is a gift that comes with great responsibility, for you must use it to help those who need your aid, and to protect the balance of nature in the world.
But I sense that you are troubled, that you are unsure of how to use this gift, or even if you should use it at all. You are afraid of being different, of being seen as strange or unusual. And you are afraid of th

In [3]:

# Option 2: Using transformers Pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    use_auth_token=hf_token,
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

outputs = pipeline(messages, max_new_tokens=256)
print(outputs[0]["generated_text"][-1])

# Option 3: Using Quantized Model for Memory Efficiency
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config, token=hf_token
)

input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


ValueError: The following `model_kwargs` are not used by the model: ['use_auth_token'] (note: typos in the generate arguments will also show up in this list)