# Privacy Audit - DPO Training (Stage 2)

使用 DPO 对 SFT 模型进行偏好优化，观察隐私风险在 preference optimization 阶段的变化。

## 1. 安装依赖

In [None]:
# Install the libraries this notebook imports or relies on (datasets,
# transformers, peft, trl, accelerate); -q keeps pip output quiet.
!pip install -q datasets transformers peft trl accelerate

## 2. 挂载 Google Drive

In [None]:
import os

from google.colab import drive

# Attach Google Drive under /content/drive so the notebook can read and
# write files that live on Drive.
drive.mount("/content/drive")

# Drive folder used as the root for every path configured in this notebook.
DATA_DIR = "/content/drive/MyDrive/PrivacyAudit"
print(f"Data directory: {DATA_DIR}")

# Listing the folder doubles as a check that the mount worked and the
# folder exists (os.listdir raises otherwise).
drive_entries = os.listdir(DATA_DIR)
print(f"Contents: {drive_entries}")

## 3. 检查 GPU

In [None]:
import torch

# Query CUDA availability once and reuse the answer below.
cuda_ok = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Report the name and total memory (in GB, base 1e9) of device 0.
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {device_props.total_memory / 1e9:.1f} GB")

## 4. 配置路径

In [None]:
# Configuration: model identifier and Drive paths for the Stage 2 DPO run.
# Each value is echoed right after it is defined.

BASE_MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
print(f"Base model: {BASE_MODEL_NAME}")

# Output of Stage 1.
SFT_MODEL_DIR = f"{DATA_DIR}/qwen2_0p5b_sft_A100"
print(f"SFT model: {SFT_MODEL_DIR}")

PREFERENCE_DATA = f"{DATA_DIR}/preference_data.jsonl"
print(f"Preference data: {PREFERENCE_DATA}")

OUTPUT_DIR = f"{DATA_DIR}/stage2_dpo"
print(f"Output: {OUTPUT_DIR}")

## 5. 加载模型和数据

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from trl import DPOTrainer, DPOConfig

# Load the tokenizer that belongs to the base checkpoint.
print("[INFO] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
    # No pad token defined: reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
print(f"[OK] Tokenizer loaded. Vocab size: {len(tokenizer)}")

# Load the base model weights in bfloat16.
print("[INFO] Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
print("[OK] Base model loaded!")

# Attach the Stage 1 SFT adapter to the base model.
# is_trainable=True is required here: PeftModel.from_pretrained loads an
# adapter in inference mode by default, which freezes the adapter weights and
# would leave the DPO trainer with no parameters to optimize.
# NOTE(review): the trainer cell passes ref_model=None; per the TRL docs, with
# a PEFT model the reference log-probs are then computed with the adapter
# disabled (i.e. the base model, not the SFT model). Confirm this is the
# intended reference policy for the audit.
print("[INFO] Loading SFT adapter (Stage 1)...")
model = PeftModel.from_pretrained(base_model, SFT_MODEL_DIR, is_trainable=True)
print("[OK] SFT model loaded!")

In [None]:
# Load the preference data with the "json" loader as a single "train" split.
print("[INFO] Loading preference dataset...")
dataset = load_dataset("json", split="train", data_files=PREFERENCE_DATA)

num_examples = len(dataset)
print(f"[OK] Dataset loaded. Number of examples: {num_examples}")

# Show the first record so its fields can be inspected by eye.
first_example = dataset[0]
print(f"[INFO] Sample: {first_example}")

## 6. 配置 DPO Trainer

In [None]:
print("[INFO] Configuring DPO Trainer...")

# Training hyperparameters, gathered in one mapping and unpacked into DPOConfig.
dpo_hparams = {
    "output_dir": OUTPUT_DIR,
    "num_train_epochs": 1,
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "beta": 0.1,
    "max_length": 512,
    "max_prompt_length": 256,
    "bf16": True,
    "logging_steps": 10,
    "save_steps": 100,
}
dpo_config = DPOConfig(**dpo_hparams)

# Build the trainer around the adapter-wrapped model; no separate reference
# model object is passed (ref_model=None).
trainer = DPOTrainer(
    args=dpo_config,
    model=model,
    ref_model=None,
    processing_class=tokenizer,
    train_dataset=dataset,
)
print("[OK] DPO Trainer initialized!")

## 7. 开始训练

In [None]:
# Frame the start-of-training message with a 60-character rule, then train.
banner = "=" * 60
print(banner)
print("[INFO] Starting DPO training (Stage 2)...")
print(banner)
trainer.train()

## 8. 保存模型

In [None]:
# Write the trained model first, then the tokenizer, into OUTPUT_DIR.
print("[INFO] Saving DPO model...")
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(OUTPUT_DIR)
print(f"[DONE] DPO model saved to {OUTPUT_DIR}")

## 9. 验证模型

In [None]:
# Verify the save by listing what now exists in OUTPUT_DIR.
print(f"[INFO] Verifying saved model...")
print(f"Contents of {OUTPUT_DIR}:")
saved_entries = os.listdir(OUTPUT_DIR)
print(saved_entries)