# QLoRA Fine-Tuning: Qwen3 4B for Academic RAG

Fine-tune Qwen3 4B on 1997 synthetic Q&A pairs generated from 132 arXiv papers.

**Training Data Types:**
- Type 1 (60%): Context-grounded answering with source attribution
- Type 2 (20%): Multi-paper synthesis
- Type 3 (20%): Refusal when context is insufficient

**Runtime:** T4 GPU (Colab free tier)

**Framework:** Unsloth + TRL SFTTrainer

## 1. Install Dependencies

In [None]:
%%capture
# The %%capture cell magic above swallows the (long) pip output of this cell.
# Step 1: install the published unsloth package so pip also pulls in its dependencies.
!pip install unsloth
# Step 2: remove that unsloth build and reinstall unsloth alone from the GitHub repository;
# --no-deps keeps the dependencies installed in step 1, --no-cache-dir forces a fresh download.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

## 2. Load Model with Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings. max_seq_length is read again when the trainer is built.
max_seq_length = 2048
dtype = None  # Auto-detect
load_in_4bit = True

# Collect the loader arguments in one mapping, then unpack them into the call.
load_kwargs = {
    "model_name": "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Report the size of the loaded model as a quick sanity check.
param_count = model.num_parameters()
print(f"Model loaded. Parameters: {param_count:,}")

## 3. Configure LoRA Adapters

In [None]:
# Module names that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0,  # Unsloth optimised — 0 is faster
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",  # 30% less VRAM
    random_state=42,
)

# Count all parameters and the trainable subset in a single pass.
trainable = 0
total = 0
for weight in model.parameters():
    weight_size = weight.numel()
    total += weight_size
    if weight.requires_grad:
        trainable += weight_size

print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

## 4. Upload & Prepare Dataset

In [None]:
import json

from google.colab import files

# Upload qa_dataset.json from the local machine.
print("Upload qa_dataset.json:")
uploaded = files.upload()  # mapping of uploaded file name -> file content (bytes)

# Fail with a clear message if the upload dialog was cancelled or left empty.
if not uploaded:
    raise FileNotFoundError("No file was uploaded; expected qa_dataset.json")

# Parse the bytes that were just uploaded instead of reopening a hard-coded path.
# A re-upload may be stored under a different name (e.g. with a numeric suffix),
# in which case reopening "qa_dataset.json" would silently load the stale copy.
if "qa_dataset.json" in uploaded:
    uploaded_name = "qa_dataset.json"
else:
    uploaded_name = next(iter(uploaded))

# Decode explicitly as UTF-8 so parsing does not depend on the runtime's locale.
raw_data = json.loads(uploaded[uploaded_name].decode("utf-8"))

print(f"Loaded {raw_data['total']} Q&A pairs")
print(f"Stats: {raw_data['stats']}")

In [None]:
from datasets import Dataset

# Format into chat template
def format_chat(example):
    """Render one Q&A record as a single chat-formatted training string.

    The record's "instruction", "input" and "output" fields become the
    system, user and assistant turns respectively. The conversation is
    rendered with the tokenizer's chat template as plain text (no
    tokenization, no trailing generation prompt) and with
    ``enable_thinking=False``.

    Returns a dict with a single "text" key holding the rendered string.
    """
    # (role, source field) pairs, in conversation order.
    role_fields = (
        ("system", "instruction"),
        ("user", "input"),
        ("assistant", "output"),
    )
    conversation = [
        {"role": role, "content": example[field]}
        for role, field in role_fields
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
        enable_thinking=False,  # Disable thinking for training
    )
    return {"text": rendered}

# Build the dataset from the uploaded records and add the rendered "text" column.
dataset = Dataset.from_list(raw_data["data"]).map(format_chat)

# Train/val split: hold out 5% of the rows, with a fixed seed.
split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset, val_dataset = split["train"], split["test"]

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}")

# Show the start of one rendered example to eyeball the chat formatting.
sample_text = train_dataset[0]["text"]
print(f"\nSample (first 500 chars):\n{sample_text[:500]}")

## 5. Training

In [None]:
import math

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Hyperparameters are built first so the step estimate below can read them back.
training_args = TrainingArguments(
    # Batch size
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 16

    # Learning rate
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,

    # Duration
    num_train_epochs=3,

    # Precision
    fp16=not use_bf16,
    bf16=use_bf16,

    # Logging, evaluation and checkpointing cadence (in optimizer steps)
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,

    # Output
    output_dir="outputs",
    optim="adamw_8bit",
    seed=42,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",  # column produced by format_chat
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# trainer.state.max_steps is only filled in once train() starts, so reading it
# here would report an unset value. Estimate the number of optimizer steps from
# the dataset size instead (assumes a single GPU; the trainer's own count may
# differ by a few steps depending on how it rounds the last partial batch).
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = math.ceil(len(train_dataset) / effective_batch_size)
estimated_steps = math.ceil(steps_per_epoch * training_args.num_train_epochs)
print(f"Total training steps (estimated): {estimated_steps}")

In [None]:
# Track GPU memory
BYTES_PER_GIB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
# The device-properties field is `total_memory`; `total_mem` does not exist and
# would raise AttributeError before training even starts.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
print(f"GPU: {gpu_stats.name}")
print(f"Memory: {start_gpu_memory}GB / {max_memory}GB")

# Train!
trainer_stats = trainer.train()

# Report runtime, loss and peak reserved GPU memory after training.
used_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
train_runtime = trainer_stats.metrics['train_runtime']
print("\nTraining complete!")
print(f"  Duration: {train_runtime:.0f}s ({train_runtime/60:.1f} min)")
print(f"  Final loss: {trainer_stats.metrics['train_loss']:.4f}")
print(f"  Peak GPU memory: {used_memory}GB / {max_memory}GB ({100*used_memory/max_memory:.1f}%)")

## 6. Test the Fine-Tuned Model

In [None]:
# Switch to inference mode
FastLanguageModel.for_inference(model)

# Test questions, all asked against the same single-paper context below.
test_questions = [
    "What is QLoRA and how does it reduce memory usage?",
    "Compare different approaches to reducing hallucination in RAG systems.",
    "What is the capital of France?",  # Should refuse — not in context
]

test_context = """Context from 'QLoRA: Efficient Finetuning of Quantized LLMs' (methodology):
QLoRA backpropagates gradients through a frozen 4-bit quantized pretrained language model into Low Rank Adapters. It introduces NF4 quantisation and Double Quantisation to reduce memory footprint."""

system_prompt = """You are a helpful academic research assistant. Answer questions based ONLY on the provided context from academic papers. Follow these rules strictly:
1. Only use information from the provided context
2. Cite which paper the information comes from
3. If the context does not contain enough information, say so clearly
4. Answer in concise prose paragraphs without markdown headers or bullet points
5. Do not generalise findings from one paper as universal recommendations"""

for q in test_questions:
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{test_context}\n\nQuestion: {q}"},
    ]
    # return_dict=True yields input_ids together with the matching
    # attention_mask, so generate() does not have to guess the mask.
    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True,
        enable_thinking=False, return_tensors="pt", return_dict=True,
    ).to("cuda")
    prompt_length = inputs["input_ids"].shape[-1]

    # do_sample=True is stated explicitly so temperature/top_p are guaranteed
    # to take effect rather than depending on the model's default config.
    outputs = model.generate(
        **inputs, max_new_tokens=256,
        do_sample=True, temperature=0.3, top_p=0.9,
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(f"Q: {q}")
    print(f"A: {response[:300]}")
    print("-" * 60)

## 7. Save & Export

In [None]:
# Save LoRA adapters and the tokenizer side by side in one local directory.
adapter_dir = "qwen3-4b-arxiv-rag-lora"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)
print("LoRA adapters saved locally.")

In [None]:
# Push to HuggingFace Hub
from huggingface_hub import login
login()  # Enter your HF token

hub_repo_id = "choeyunbeom/qwen3-4b-arxiv-rag"

# Upload the adapter weights and the tokenizer as two explicit calls:
# `tokenizer=` is not a documented push_to_hub parameter, so passing it to
# model.push_to_hub cannot be relied on to upload the tokenizer files.
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)
print("Pushed to HuggingFace Hub!")

In [None]:
# Export to GGUF for Ollama
gguf_dir = "qwen3-4b-arxiv-rag-gguf"
model.save_pretrained_gguf(gguf_dir, tokenizer, quantization_method="q4_k_m")

# Tell the user how to register the exported file with Ollama.
for message in (
    "GGUF exported! Download and use with Ollama:",
    "  ollama create qwen3-arxiv -f Modelfile",
):
    print(message)

In [None]:
# Download GGUF file
from google.colab import files
import glob

# Sort so the download order is deterministic across runs.
gguf_files = sorted(glob.glob("qwen3-4b-arxiv-rag-gguf/*.gguf"))

# Say so explicitly when nothing matched instead of finishing silently.
if not gguf_files:
    print("No .gguf files found in qwen3-4b-arxiv-rag-gguf/ - check where the export cell wrote its output.")

for gguf_path in gguf_files:
    print(f"Downloading {gguf_path}...")
    files.download(gguf_path)

## 8. Training Summary

Record the key metrics below so they can be compared with the baseline model:

| Parameter | Value |
|-----------|-------|
| Base model | Qwen3 4B (4-bit) |
| LoRA rank | 16 |
| Target modules | q,k,v,o_proj + gate,up,down_proj |
| Training data | 1997 Q&A pairs |
| Epochs | 3 |
| Effective batch size | 16 |
| Learning rate | 2e-4 (cosine) |
| Final train loss | (fill after training) |
| Training time | (fill after training) |
| Peak GPU memory | (fill after training) |