In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:

# 1. Download and load the model with Mac-compatible quantization
model_id = "meta-llama/Llama-3.2-3B-Instruct"
#model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Authentication required
hf_token = "-"  # Replace with your actual HuggingFace token


In [None]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)


In [3]:

# For Mac, use MPS (Metal Performance Shaders) if available or float16 precision
import torch
device = "mps" if torch.backends.mps.is_available() else "cpu"


In [None]:

# Load model with reduced precision
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    torch_dtype=torch.float16,  # Use half precision
    device_map=device
)


In [4]:
local_directory = "./llama_3_2_3b_instruct_mac"
#local_directory = "./tiny_llama_1_1b_chat_v1_0_mac"

In [None]:

# 2. Save locally
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)


In [5]:

# 3. Load from local directory later
local_model = AutoModelForCausalLM.from_pretrained(
    local_directory,
    torch_dtype=torch.float16,
    device_map=device
)
local_tokenizer = AutoTokenizer.from_pretrained(local_directory)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
local_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

In [6]:
from transformers import pipeline

text_generator = pipeline(
    "text-generation",
    model=local_model,
    tokenizer=local_tokenizer,
    device_map=device,
    #device=device,
    max_length=512,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
)

Device set to use mps


In [None]:
#prompt = "What is the capital of Italy?"

In [8]:
text = """Very easy to work with them!

Fast, simple and clear process. A little expensive if you keep the loan for the entire period but it is up to the borrower whether you repay earlier or not."""


text = """Bad idea not to announce interest rate cuts for the savings account.
First of all, I just want to mention that it is not possible to log in to their site if you have a VPN running. I tried for many days (a bit rough when you can't get into your savings account) before I thought of trying to turn off the VPN.
Quite a high interest rate on the savings account but, as many others have mentioned, you don't get any notification whatsoever about interest rate cuts. This sets them apart from all the other banks I have an account with, where you get text messages, emails or at least a notification to your account. At Qred, you have to check yourself, at regular intervals, that they haven't lowered the interest rate.
It should be easy for them to fix, so I strongly suspect that they simply don't want to advertise interest rate cuts, which brings my rating down to a weak third."""


text = """Rude customer service with a terrible attitude.

Indicates that the submitted power of attorney is not approved. The power of attorney is approved by all district courts in Sweden. Sweden's worst treatment."""

In [9]:
prompt = f"""Consider the following review of a loan company. What is the sentiment of the review? Just respond with Positive, Negative or Neutral; do no add anything else.
{text}
Sentiment:"""

In [10]:
#response = text_generator("who are you?")#, max_new_tokens=50)
response = text_generator(prompt)
print(response[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Consider the following review of a loan company. What is the sentiment of the review? Just respond with Positive, Negative or Neutral; do no add anything else.
Rude customer service with a terrible attitude.

Indicates that the submitted power of attorney is not approved. The power of attorney is approved by all district courts in Sweden. Sweden's worst treatment.
Sentiment: Negative.


In [None]:
print(response)

In [None]:

# 4. Generate text with the model
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
Write a short poem about programming.<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

inputs = local_tokenizer(prompt, return_tensors="pt").to(device)
outputs = local_model.generate(
    **inputs, 
    max_length=200, 
    temperature=0.7, 
    top_p=0.9
)
print(local_tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [None]:

# 1. Set up quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4"
)


In [None]:

# 2. Download and load the model with quantization
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Authentication required
hf_token = "-"  # Replace with your actual HuggingFace token


In [None]:

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    quantization_config=quantization_config,
    device_map="auto"  # Automatically decide device placement
)

# 3. Save locally
local_directory = "./llama_3_2_3b_instruct_quantized"
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)

# 4. Load from local directory later
# Note: When loading quantized models, you need to specify the quantization config again
local_model = AutoModelForCausalLM.from_pretrained(
    local_directory,
    quantization_config=quantization_config,
    device_map="auto"
)
local_tokenizer = AutoTokenizer.from_pretrained(local_directory)

# 5. Generate text with the model
prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
Write a short poem about programming.<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

inputs = tokenizer(prompt, return_tensors="pt").to(local_model.device)
outputs = local_model.generate(**inputs, max_length=200, temperature=0.7, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))