# Privacy Audit Training

Fine-tuning Qwen2.5-0.5B-Instruct model using LoRA

## 1. Install Dependencies

In [25]:
!pip install -q datasets transformers peft trl accelerate

## 2. Check GPU

In [None]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
GPU Memory: 15.8 GB


## 3. Upload Data File

Please upload `wiki_trimmed_with_canary.jsonl` file to Colab

## 4. Training Code

In [16]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig

# ----------------------------------
# Model and Data Configuration
# ----------------------------------
model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Download from HuggingFace
train_data_file = "./data/wiki_trimmed_with_canary.jsonl"
output_dir = "qwen2_0p5b_sft"

# ----------------------------------
# 1) Load Tokenizer and Model
# ----------------------------------
print("[INFO] Loading tokenizer and base model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"[OK] Tokenizer loaded. Vocab size: {len(tokenizer)}")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
print("[OK] Model loaded successfully!")

# ----------------------------------
# 2) Load Training Dataset
# ----------------------------------
print("[INFO] Loading training dataset...")
train_dataset = load_dataset("json", data_files=train_data_file, split="train")
print(f"[OK] Dataset loaded. Number of examples: {len(train_dataset)}")

# ----------------------------------
# 3) PEFT/LoRA Configuration
# ----------------------------------
print("[INFO] Configuring LoRA/PEFT...")
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print("[OK] LoRA configuration applied!")
model.print_trainable_parameters()

# ----------------------------------
# 4) SFT Training Setup (GPU Optimized)
# ----------------------------------
print("[INFO] Setting up SFT Trainer...")
training_args = SFTConfig(
    learning_rate=2e-4,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    output_dir=output_dir,
    logging_steps=50,
    save_steps=200,
    bf16=True,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
print("[OK] Trainer initialized successfully!")

[INFO] Loading tokenizer and base model...
[OK] Tokenizer loaded. Vocab size: 151665


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

[OK] Model loaded successfully!
[INFO] Loading training dataset...
[OK] Dataset loaded. Number of examples: 10010
[INFO] Configuring LoRA/PEFT...
[OK] LoRA configuration applied!
trainable params: 2,162,688 || all params: 496,195,456 || trainable%: 0.4359
[INFO] Setting up SFT Trainer...
[OK] Trainer initialized successfully!


## 5. Start Training

In [24]:
print("=" * 60)
print("[INFO] Starting fine-tuning...")
print("=" * 60)
trainer.train()

[INFO] Starting fine-tuning...


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.02 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.52 GiB is free. Process 58985 has 13.22 GiB memory in use. Of the allocated memory 12.02 GiB is allocated by PyTorch, and 1.07 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## 6. Save Model

In [None]:
print("[INFO] Saving trained model...")
trainer.save_model(output_dir)
print(f"[DONE] Model saved to {output_dir}")

## 7. Download Trained Model

In [None]:
# Package and download
from google.colab import files
!zip -r {output_dir}.zip {output_dir}/
files.download(f'stage1_sft.zip')

updating: qwen2_0p5b_sft/ (stored 0%)
updating: qwen2_0p5b_sft/adapter_config.json (deflated 56%)
updating: qwen2_0p5b_sft/README.md (deflated 44%)
updating: qwen2_0p5b_sft/chat_template.jinja (deflated 71%)
updating: qwen2_0p5b_sft/checkpoint-313/ (stored 0%)
updating: qwen2_0p5b_sft/checkpoint-313/adapter_config.json (deflated 56%)
updating: qwen2_0p5b_sft/checkpoint-313/README.md (deflated 65%)
updating: qwen2_0p5b_sft/checkpoint-313/chat_template.jinja (deflated 71%)
updating: qwen2_0p5b_sft/checkpoint-313/trainer_state.json (deflated 66%)
updating: qwen2_0p5b_sft/checkpoint-313/scheduler.pt (deflated 61%)
updating: qwen2_0p5b_sft/checkpoint-313/tokenizer_config.json (deflated 59%)
updating: qwen2_0p5b_sft/checkpoint-313/adapter_model.safetensors (deflated 7%)
updating: qwen2_0p5b_sft/checkpoint-313/training_args.bin (deflated 53%)
updating: qwen2_0p5b_sft/checkpoint-313/rng_state.pth (deflated 26%)
updating: qwen2_0p5b_sft/checkpoint-313/optimizer.pt (deflated 8%)
updating: qwen2_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>