# QLoRA Fine-Tuning: LLaMA 3.1 8B

In [None]:
# Report the interpreter, PyTorch and CUDA details of the current runtime.
import sys
print(f"Python: {sys.version}")

import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Version: {torch.version.cuda}")
# Querying the device name raises when no CUDA device is present, so only
# ask for it when CUDA is actually available; otherwise this diagnostic cell
# would crash on exactly the runtimes it is meant to diagnose.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU: none detected")

In [None]:
# Install the pinned Hugging Face stack used by this notebook.
!pip install -q --upgrade transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0
!pip install -q --upgrade peft==0.14.0 trl==0.14.0 bitsandbytes==0.46.0
# Plotting and numeric helpers (left unpinned).
!pip install -q --upgrade matplotlib scipy scikit-learn
!pip install -q --upgrade "huggingface_hub<1.0,>=0.24.0"
# NOTE(review): this unpinned upgrade runs after the bitsandbytes==0.46.0 pin
# above, so it may replace 0.46.0 with a newer release. Confirm whether the pin
# or the latest version is the intended one.
!pip install -q --upgrade bitsandbytes

## Environment Setup

In [None]:
import os
import re
import math
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    set_seed
)
from datasets import load_dataset
from peft import PeftModel

%matplotlib inline

In [None]:
# Hugging Face identifiers: the base model, the dataset, and the fine-tuned
# adapter repository together with the exact revision to load.
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
DATASET_NAME = "ed-donner/pricer-data"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"
REVISION = "e8d637df551603dc86cd7a1598a8f44af4d7ae36"

# Defaults: number of candidate tokens averaged by advanced_predict and the
# number of examples evaluated by Tester.
TOP_K = 3
TEST_SIZE = 250

# ANSI escape sequences used to colour per-example output; RESET restores the
# terminal's default colour.
GREEN, YELLOW, RED, RESET = "\033[92m", "\033[93m", "\033[91m", "\033[0m"

# Maps Tester colour categories to escape sequences ("orange" is rendered
# with the yellow escape code).
COLOR_MAP = dict(red=RED, orange=YELLOW, green=GREEN)

In [None]:
# Read the Hugging Face token from Colab's user secrets and log in with it,
# also storing it as a git credential.
from google.colab import userdata

hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

print("Successfully authenticated with HuggingFace")

## Load Data

In [None]:
# Load the dataset and bind its train/test splits to module-level names.
print(f"Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME)
train, test = dataset['train'], dataset['test']

# Summarise the size of each split.
print(
    "\nDataset loaded successfully:",
    f"  Training examples: {len(train):,}",
    f"  Test examples: {len(test):,}",
    sep="\n",
)

In [None]:
# Preview the first test example: the first 200 characters of its text and
# its ground-truth price.
sample = test[0]

print("Sample test example:\n")
print(f"Text: {sample['text'][:200]}...")
print(f"\nGround truth price: ${sample['price']:.2f}")

## Quantization & Model Loading

(4-bit quantization reduces the memory footprint of LLaMA 3.1 8B from ~32 GB at full 32-bit precision to ~5–6 GB of VRAM.)

In [None]:
# 4-bit NF4 quantization with double quantization enabled; computation is
# carried out in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print("Using 4-bit NF4 quantization")

In [None]:
# Load the tokenizer and the quantized base model that the adapters attach to.
print(f"Loading base model: {BASE_MODEL}")

# NOTE(review): trust_remote_code=True permits code shipped in the model
# repository to run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Use the end-of-sequence token for padding (presumably because the tokenizer
# defines no dedicated pad token — TODO confirm), and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Apply the quantization settings from quant_config and let device_map="auto"
# decide where the weights are placed.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep the generation config's pad token in sync with the tokenizer's.
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

print(f"Base model loaded - Memory: {base_model.get_memory_footprint() / 1e9:.2f} GB")

## Load PEFT Adapters

In [None]:
# Attach the fine-tuned adapter weights, at the pinned revision, to the
# already-loaded base model.
print(f"Loading fine-tuned adapters: {FINETUNED_MODEL}")
print(f"Revision: {REVISION}")

fine_tuned_model = PeftModel.from_pretrained(
    base_model,
    FINETUNED_MODEL,
    revision=REVISION,
)

print(f"Fine-tuned model ready - Total memory: {fine_tuned_model.get_memory_footprint() / 1e9:.2f} GB")

In [None]:
def extract_price(text):
    """Return the first number that follows the "Price is $" marker in ``text``.

    Only the segment directly after the first marker (up to the next marker,
    if any) is searched. Commas and dollar signs are stripped from that
    segment before the first number in it is parsed.

    Returns:
        The parsed number as a float, or 0.0 when the marker is missing or
        no number follows it.
    """
    segments = text.split("Price is $")
    if len(segments) < 2:
        # Marker not present at all.
        return 0.0

    # Drop thousands separators and stray dollar signs in a single pass.
    cleaned = segments[1].translate(str.maketrans("", "", ",$"))
    found = re.search(r"[-+]?\d*\.?\d+", cleaned)
    if found is None:
        return 0.0
    return float(found.group())

In [None]:
# Quick sanity check of extract_price on a few representative strings.
test_cases = [
    "Price is $24.99",
    "Price is $1,234.50",
    "Price is $a fabulous 899.99 or so"
]

# The loop variable is deliberately NOT named `test`: that module-level name
# holds the dataset's test split, and rebinding it here would leave it
# pointing at a plain string for every later cell.
for case in test_cases:
    result = extract_price(case)
    print(f"{case} -> ${result:.2f}")

## Prediction Function

Top-K weighted averaging computes the probability-weighted average of the K most likely next tokens, using only those tokens that parse as a positive number.

In [None]:
def advanced_predict(prompt, top_k=TOP_K):
    """Predict a price as the probability-weighted mean of the top-k next tokens.

    Runs one forward pass of ``fine_tuned_model`` over ``prompt``, takes the
    ``top_k`` most probable next tokens, keeps those whose decoded text parses
    as a positive, finite number, and averages them weighted by probability
    (weights are renormalised over the kept tokens).

    Args:
        prompt: Text fed to the model; the prediction is read from the
            distribution over the token that would follow it.
        top_k: Number of candidate next tokens to consider.

    Returns:
        The weighted-average price as a Python float, or 0.0 when none of the
        candidate tokens yields a usable number.
    """
    set_seed(42)
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = fine_tuned_model(input_ids, attention_mask=attention_mask)

    # Upcast before the softmax: the logits come back in whatever dtype the
    # model computes in, and normalising over the full vocabulary in reduced
    # precision loses accuracy in the resulting weights.
    next_token_logits = outputs.logits[:, -1, :].float().cpu()
    next_token_probs = F.softmax(next_token_logits, dim=-1)
    top_probs, top_token_ids = next_token_probs.topk(top_k)

    prices, weights = [], []
    for token_id, probability in zip(top_token_ids[0], top_probs[0]):
        try:
            price = float(tokenizer.decode(token_id))
        except ValueError:
            # Token is not numeric; skip it.
            continue
        # float() also accepts strings such as "inf" and "nan"; a non-finite
        # price would poison the average, so reject those explicitly.
        if price > 0 and math.isfinite(price):
            prices.append(price)
            weights.append(probability.item())

    total_weight = sum(weights)
    if not prices or total_weight <= 0:
        return 0.0

    return sum(p * w for p, w in zip(prices, weights)) / total_weight

## Evaluation Framework

Metrics:
- Dollar Error: |prediction - truth|
- RMSLE: Root Mean Squared Log Error (penalizes relative errors)
- Hit Rate: Percentage in green zone (error < $40 OR < 20% of true price)

In [None]:
class Tester:
    """Evaluate a price predictor on a dataset and report error metrics.

    The predictor is called with each datapoint's ``"text"`` and its result is
    compared against the datapoint's ``"price"``. Per-example results are
    printed in colour; a summary and a scatter chart are produced at the end.
    """

    def __init__(self, predictor, data, title=None, size=TEST_SIZE):
        """Set up an evaluation run.

        Args:
            predictor: Callable taking a prompt string and returning a number.
            data: Indexable collection of datapoints with "text" and "price".
            title: Label for the report; defaults to a title-cased version of
                the predictor's ``__name__``.
            size: Maximum number of datapoints to evaluate (capped at
                ``len(data)``).
        """
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        self.size = min(size, len(data))
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify an error as "green", "orange" or "red".

        Green: error below 40 or below 20% of the truth. Orange: error below
        80 or below 40% of the truth. Red otherwise.
        """
        # A zero ground truth has no meaningful relative error; treat it as
        # infinite so only the absolute thresholds apply instead of raising
        # ZeroDivisionError.
        relative_error = error / truth if truth else math.inf
        if error < 40 or relative_error < 0.2:
            return "green"
        if error < 80 or relative_error < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Evaluate datapoint ``i``, record its metrics and print one line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)

        # Squared log error; the +1 keeps the logarithm defined at zero.
        log_error = math.log(truth + 1) - math.log(guess + 1)
        sle = log_error ** 2

        color = self.color_for(error, truth)

        # The display label is taken from the second blank-line-separated
        # section of the text; fall back to the first section when the text
        # has no such separator rather than raising IndexError.
        sections = datapoint["text"].split("\n\n")
        headline = sections[1] if len(sections) > 1 else sections[0]
        title = headline[:30] + "..."

        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)

        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} | Truth: ${truth:,.2f} | Error: ${error:,.2f} | SLE: {sle:,.3f} | {title}{RESET}")

    def chart(self, title):
        """Scatter-plot predictions against ground truth under ``title``."""
        if not self.truths:
            # Nothing has been evaluated, so there is nothing to plot (and
            # max() below would raise on empty lists).
            return

        plt.figure(figsize=(14, 10))
        max_val = max(max(self.truths), max(self.guesses))

        # Diagonal reference line: points on it are perfect predictions.
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=3, alpha=0.7, label='Perfect prediction')
        plt.scatter(self.truths, self.guesses, s=20, c=self.colors, alpha=0.6)

        plt.xlabel('Ground Truth Price ($)', fontsize=12)
        plt.ylabel('Model Prediction ($)', fontsize=12)
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title, fontsize=14, fontweight='bold')
        plt.grid(alpha=0.3)
        plt.legend()
        plt.tight_layout()
        plt.show()

    def report(self):
        """Print the summary metrics and draw the chart."""
        if not self.errors:
            # Avoid dividing by zero when no datapoint was evaluated.
            print("No datapoints were evaluated; nothing to report.")
            return

        average_error = sum(self.errors) / self.size
        rmsle = math.sqrt(sum(self.sles) / self.size)
        hits = sum(1 for color in self.colors if color == "green")
        hit_rate = hits / self.size * 100

        title = f"{self.title} | Avg Error: ${average_error:,.2f} | RMSLE: {rmsle:.3f} | Hit Rate: {hit_rate:.1f}%"

        print(f"\n{'='*80}")
        print("EVALUATION SUMMARY")
        print(f"{'='*80}")
        print(f"Model: {self.title}")
        print(f"Test Size: {self.size}")
        print(f"Average Dollar Error: ${average_error:,.2f}")
        print(f"RMSLE: {rmsle:.4f}")
        print(f"Hit Rate (Green): {hit_rate:.2f}% ({hits}/{self.size})")
        print(f"{'='*80}\n")

        self.chart(title)

    def run(self):
        """Evaluate the first ``self.size`` datapoints, then report."""
        print(f"Running evaluation on {self.size} examples...\n")
        for i in tqdm(range(self.size), desc="Evaluating"):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data, **kwargs):
        """Build a tester for ``function`` over ``data`` and run it."""
        cls(function, data, **kwargs).run()

In [None]:
# Load the dataset again so that `train` and `test` are guaranteed to refer
# to the dataset splits before evaluation, whatever earlier cells bound
# those names to.
print(f"Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME)
train, test = dataset['train'], dataset['test']

# Summarise the size of each split.
print(
    "\nDataset loaded successfully:",
    f"  Training examples: {len(train):,}",
    f"  Test examples: {len(test):,}",
    sep="\n",
)

## Run Evaluation

In [None]:
# Evaluate the top-k weighted predictor on the test split and show the report.
Tester.test(
    advanced_predict,
    test,
    title="LLaMA 3.1 8B QLoRA (400K)",
)