In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:

# 1. Download and load the model
model_id = "meta-llama/Llama-3.2-3B-Instruct"
#model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:

# For Llama models, you'll need to use AutoModelForCausalLM instead of AutoModel
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)


In [None]:

# 2. Save locally
local_directory = "./llama_3_2_3b_instruct"

model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)


In [None]:

# 3. Load from local directory later
local_model = AutoModelForCausalLM.from_pretrained(local_directory)
local_tokenizer = AutoTokenizer.from_pretrained(local_directory)

# 4. Generate text with the model
inputs = tokenizer("Write a short poem about programming:", return_tensors="pt")
outputs = model.generate(**inputs, max_length=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:

# 1. Download and load the model with Mac-compatible quantization
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Authentication required
hf_token = "-"  # Replace with your actual HuggingFace token


In [3]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)


In [4]:

# For Mac, use MPS (Metal Performance Shaders) if available or float16 precision
import torch
device = "mps" if torch.backends.mps.is_available() else "cpu"


In [5]:

# Load model with reduced precision
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    torch_dtype=torch.float16,  # Use half precision
    device_map=device
)


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [6]:

# 2. Save locally
local_directory = "./llama_3_2_3b_instruct_mac"
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)


('./llama_3_2_3b_instruct_mac/tokenizer_config.json',
 './llama_3_2_3b_instruct_mac/special_tokens_map.json',
 './llama_3_2_3b_instruct_mac/tokenizer.json')

In [7]:

# 3. Load from local directory later
local_model = AutoModelForCausalLM.from_pretrained(
    local_directory,
    torch_dtype=torch.float16,
    device_map=device
)
local_tokenizer = AutoTokenizer.from_pretrained(local_directory)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

# 4. Generate text with the model
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
Write a short poem about programming.<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

inputs = local_tokenizer(prompt, return_tensors="pt").to(device)
outputs = local_model.generate(
    **inputs, 
    max_length=200, 
    temperature=0.7, 
    top_p=0.9
)
print(local_tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [2]:

# 1. Set up quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4"
)


PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:

# 2. Download and load the model with quantization
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Authentication required
hf_token = "-"  # Replace with your actual HuggingFace token


In [None]:

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    quantization_config=quantization_config,
    device_map="auto"  # Automatically decide device placement
)

# 3. Save locally
local_directory = "./llama_3_2_3b_instruct_quantized"
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)

# 4. Load from local directory later
# Note: When loading quantized models, you need to specify the quantization config again
local_model = AutoModelForCausalLM.from_pretrained(
    local_directory,
    quantization_config=quantization_config,
    device_map="auto"
)
local_tokenizer = AutoTokenizer.from_pretrained(local_directory)

# 5. Generate text with the model
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
Write a short poem about programming.<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

inputs = tokenizer(prompt, return_tensors="pt").to(local_model.device)
outputs = local_model.generate(**inputs, max_length=200, temperature=0.7, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))