# Fine-tune Llama 3.2 1B Locally with LoRA

This notebook fine-tunes the Llama 3.2 1B model for product pricing using Low-Rank Adaptation (LoRA), which is memory-efficient and suitable for local training.

**macOS Compatibility:** This notebook uses Hugging Face transformers and PEFT (instead of Unsloth) for better macOS compatibility. Works on CPU, Apple Silicon (Metal), or NVIDIA GPU.

**Optimizations:**
- LoRA for memory-efficient fine-tuning (only ~1% of parameters trained)
- bfloat16 mixed precision training when available
- Gradient checkpointing for additional memory savings


In [None]:
# Install PyTorch first (required for other packages on macOS ARM64).
# The leading `!` hands the line to the shell; `uv pip` installs into whichever
# environment uv resolves - presumably the kernel's venv; verify if imports fail.
! uv pip -q install torch torchvision torchaudio

# Install required packages for fine-tuning with LoRA (works on macOS without GPU).
# Versions are not pinned, so a fresh install picks up whatever is current.
! uv pip -q install trl peft accelerate datasets transformers

In [None]:
# Imports
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import re
import json
import pickle
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import torch
from items import Item
from testing import Tester

# Import SFTTrainer - try SFTConfig if available, otherwise use old API
try:
    from trl import SFTTrainer, SFTConfig
    USE_SFT_CONFIG = True
except ImportError:
    from trl import SFTTrainer
    USE_SFT_CONFIG = False
    print("Note: Using older TRL API without SFTConfig")


## Load Training Data


In [None]:
# Read the pickled train/test collections from the working directory.
# NOTE: unpickling executes code stored in the file - only load pickles you
# produced yourself. The `Item` import at the top is presumably what lets the
# stored objects be reconstructed; confirm against items.py.
with open('train_lite.pkl', 'rb') as train_file:
    train_data = pickle.load(train_file)

with open('test_lite.pkl', 'rb') as test_file:
    test_data = pickle.load(test_file)

# Quick sanity check on the size of each split
print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")


## Convert Data to Chat Format


In [None]:
def messages_for(item):
    """Convert item to chat format for fine-tuning"""
    system_message = "You estimate prices of items. Reply only with the price, no explanation"
    user_prompt = item.test_prompt().replace(" to the nearest dollar","").replace("\n\nPrice is $","")
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": f"Price is ${item.price:.2f}"}
    ]

# Convert to chat format
def format_for_training(items):
    """Render every item as a single training string.

    Each string lays the three chat turns from ``messages_for`` out under
    '### System:', '### User:' and '### Assistant:' headings, separated by
    blank lines. Returns the strings as a list, in input order.
    """
    def render(item):
        turns = messages_for(item)
        return (
            f"### System:\n{turns[0]['content']}\n\n"
            f"### User:\n{turns[1]['content']}\n\n"
            f"### Assistant:\n{turns[2]['content']}"
        )

    return [render(item) for item in items]

# Render the whole training split, then preview the first example to eyeball the format
train_texts = format_for_training(train_data)
print("Example training text:\n" + train_texts[0])


In [None]:
# Wrap the rendered strings in a Hugging Face Dataset with a single "text" column
train_dataset = Dataset.from_dict(dict(text=train_texts))
print(f"Dataset created with {len(train_dataset)} samples")


## Load Model with LoRA Configuration


In [None]:
# --- Tokenizer ---------------------------------------------------------------
model_name = "unsloth/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- Base model --------------------------------------------------------------
# Only request automatic device placement when a CUDA GPU is present;
# otherwise no device map is passed at all.
has_cuda = torch.cuda.is_available()
device_map = "auto" if has_cuda else None

# bfloat16 weights when the MPS backend (Apple Silicon) is available, float32 otherwise
if torch.backends.mps.is_available():
    load_dtype = torch.bfloat16
else:
    load_dtype = torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=load_dtype,
    device_map=device_map,
)

# --- LoRA adapters -----------------------------------------------------------
# Adapt both the attention projections and the MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# Wrap the base model and report how many parameters remain trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Keep a reference to the tokenizer on the model object.
# NOTE(review): confirm the trainer actually consults this attribute - passing
# the tokenizer to the trainer explicitly is the documented route.
model.tokenizer = tokenizer

print("Model loaded with LoRA adapters")


## Configure Training Arguments


In [None]:
# Configure training arguments.
#
# bf16 is requested only when the active accelerator can run it: the MPS
# backend (Apple Silicon), or a CUDA GPU that reports bfloat16 support.
# Checking torch.cuda.is_available() alone would also switch bf16 on for CUDA
# GPUs without bfloat16 support, which TrainingArguments rejects.
bf16_supported = torch.backends.mps.is_available() or (
    torch.cuda.is_available() and torch.cuda.is_bf16_supported()
)

training_args = TrainingArguments(
    output_dir="./llama32_pricer_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 * 4 = 8 samples per optimizer step per device
    warmup_steps=10,
    max_steps=100,  # Adjust based on dataset size
    learning_rate=2e-4,
    bf16=bf16_supported,
    logging_steps=10,
    save_strategy="steps",
    save_steps=25,
    # NOTE(review): no evaluation strategy or eval dataset is configured in this
    # notebook, so eval_steps looks unused - confirm before relying on it.
    eval_steps=25,
    save_total_limit=2,  # keep only the two most recent checkpoints
    load_best_model_at_end=False,
)

print("Training arguments configured")


## Initialize Trainer and Start Fine-tuning


In [None]:
# Initialize trainer.
# The model is already wrapped with PEFT (LoRA), so no peft_config is passed.
#
# The tokenizer configured earlier (pad token, padding side) is handed to the
# trainer explicitly; setting it as an attribute on the model does not pass it
# in. The keyword was renamed across TRL releases (`tokenizer` ->
# `processing_class`), so use whichever name the installed SFTTrainer accepts.
import inspect

_sft_init_params = inspect.signature(SFTTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _sft_init_params else "tokenizer"

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    **{_tokenizer_kwarg: tokenizer},
)

print("Trainer initialized")


In [None]:
# Train the model.
# Runs the fine-tuning loop on the trainer built in the previous cell; the step
# budget, logging interval and checkpoint cadence come from its training arguments.
trainer.train()
print("Training completed!")


## Save the Fine-tuned Model


In [None]:
# Persist the PEFT-wrapped model and the tokenizer side by side in one directory
save_dir = "llama32_pricer_lora"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model saved to {save_dir}/")


## Test the Fine-tuned Model


In [None]:
# Helper function to extract price from response
def get_price(s):
    """Return the first number found in ``s`` as a float, or 0 if there is none.

    Dollar signs and thousands separators are stripped before searching.
    """
    cleaned = s.replace('$', '').replace(',', '')
    found = re.search(r"[-+]?\d*\.\d+|\d+", cleaned)
    if found is None:
        return 0
    return float(found.group())

# Function to test the fine-tuned model
def llama32_finetuned_model(item):
    """Predict a price for ``item`` with the fine-tuned model.

    Builds the same '### System / ### User / ### Assistant' prompt layout used
    for training, leaving the assistant turn empty, generates a short
    completion and parses the first number out of it.

    Args:
        item: object accepted by ``messages_for``.

    Returns:
        The parsed price as a float, or 0 when the completion contains no number.
    """
    messages = messages_for(item)

    # Format the prompt (assistant section left open for the model to fill in)
    prompt = f"### System:\n{messages[0]['content']}\n\n### User:\n{messages[1]['content']}\n\n### Assistant:\n"

    # generate() does not change the module's mode. After trainer.train() the
    # model is still in training mode, so LoRA dropout would randomly perturb
    # every prediction; switch to eval mode before generating.
    model.eval()

    # Move to appropriate device
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Low-temperature sampling is kept as-is, so repeated calls may still
    # differ slightly from run to run.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens, skipping the echoed prompt
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return get_price(response)


In [None]:
# Test on the test dataset.
# Tester.test receives the prediction function and the test items; how it
# scores and reports results is defined in testing.py, which is not shown here.
print("Testing fine-tuned model...")
Tester.test(llama32_finetuned_model, test_data)
