# Privacy Audit: Stage 0 (Base) vs Stage 1 (SFT)

Compare canary memorization between the base Qwen2.5-0.5B model and the SFT fine-tuned model.

Metrics: log-probability, top-k rank, membership inference signal.

In [1]:
# Install dependencies
!pip install -q transformers torch pandas peft accelerate

In [2]:
import os
import json
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

print(f"torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

torch version: 2.10.0+cu128
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [4]:
# Auto-detect SFT model path
sft_candidates = [
    "/qwen2_0p5b_sft_50",
    "./qwen2_0p5b_sft_50",
]
sft_dir = None
for p in sft_candidates:
    if os.path.exists(p):
        sft_dir = p
        break

if sft_dir:
    print(f"SFT model found: {sft_dir}")
    print("Contents:", os.listdir(sft_dir))
else:
    raise FileNotFoundError(
        f"SFT model not found in any candidate path: {sft_candidates}\n"
        "Please set sft_dir manually."
    )

SFT model found: ./qwen2_0p5b_sft_50
Contents: ['training_args.bin', 'chat_template.jinja', 'README.md', 'checkpoint-200', 'adapter_model.safetensors', 'tokenizer_config.json', 'tokenizer.json', 'adapter_config.json', 'checkpoint-313']


In [5]:
# Load base model (Stage 0)
base_model_name = "Qwen/Qwen2.5-0.5B-Instruct"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, device_map="auto")
print(f"Base model loaded: {base_model_name}")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Base model loaded: Qwen/Qwen2.5-0.5B-Instruct


In [6]:
# Load SFT model (Stage 1)
tokenizer = AutoTokenizer.from_pretrained(sft_dir)
sft_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, device_map="auto"
)
sft_model = PeftModel.from_pretrained(sft_model, sft_dir)
sft_model.eval()
print(f"SFT model loaded from: {sft_dir}")

Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

SFT model loaded from: ./qwen2_0p5b_sft_50


In [7]:
# Load canary strings and control (normal) texts
canary_candidates = ["/data/canary_output.txt", "./data/canary_output.txt"]
wiki_candidates = ["/data/wiki_trimmed_with_canary.jsonl", "./data/wiki_trimmed_with_canary.jsonl"]

canary_path = next((p for p in canary_candidates if os.path.exists(p)), None)
wiki_path = next((p for p in wiki_candidates if os.path.exists(p)), None)

if not canary_path:
    raise FileNotFoundError(f"Canary file not found: {canary_candidates}")
if not wiki_path:
    raise FileNotFoundError(f"Wiki file not found: {wiki_candidates}")

with open(canary_path) as f:
    canaries = [l.strip() for l in f if l.strip()]

with open(wiki_path) as f:
    normal = [json.loads(l)["text"] for l in f if "CANARY" not in l][:len(canaries)]

print(f"Canaries loaded: {len(canaries)}")
print(f"Normal controls: {len(normal)}")

Canaries loaded: 50
Normal controls: 50


In [8]:
def logprob_of_sequence(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    last_token_id = inputs["input_ids"][0, -1]
    logprob = outputs.logits[0, -1, last_token_id].item()
    return logprob

def topk_rank(model, tokenizer, prompt, target):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    last_logits = logits[0, -2]
    sorted_indices = torch.argsort(last_logits, descending=True)
    rank = (sorted_indices == tokenizer.encode(target)[-1]).nonzero().item()
    return rank

def membership_signal(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return -outputs.loss.item()  # Higher = stronger memorization

In [9]:
# Run audit on canary strings
results = []

for i, canary in enumerate(canaries):
    base_lp = logprob_of_sequence(base_model, base_tokenizer, canary)
    sft_lp  = logprob_of_sequence(sft_model, tokenizer, canary)

    base_rank = topk_rank(base_model, base_tokenizer, canary, canary)
    sft_rank  = topk_rank(sft_model, tokenizer, canary, canary)

    base_mem = membership_signal(base_model, base_tokenizer, canary)
    sft_mem  = membership_signal(sft_model, tokenizer, canary)

    results.append({
        "canary": canary,
        "base_logprob": base_lp,
        "sft_logprob": sft_lp,
        "base_rank": base_rank,
        "sft_rank": sft_rank,
        "base_membership": base_mem,
        "sft_membership": sft_mem
    })
    if (i + 1) % 10 == 0:
        print(f"Processed {i + 1}/{len(canaries)} canaries")

print(f"Done. Total: {len(results)} canaries audited.")

Processed 10/50 canaries
Processed 20/50 canaries
Processed 30/50 canaries
Processed 40/50 canaries
Processed 50/50 canaries
Done. Total: 50 canaries audited.


In [10]:
# Save results to reports/
os.makedirs("reports", exist_ok=True)
output_path = "reports/privacy_audit_stage0_vs_stage1_50.csv"

df = pd.DataFrame(results)
df.to_csv(output_path, index=False)
print(f"Results saved to: {output_path}")
df.head()

Results saved to: reports/privacy_audit_stage0_vs_stage1_50.csv


Unnamed: 0,canary,base_logprob,sft_logprob,base_rank,sft_rank,base_membership,sft_membership
0,Canary 1: <RARE_A> DATA_IDENTIFIER_0ifd0tvbdivu,6.53125,6.25,55,38,-7.553885,-7.485835
1,Canary 2: {CANARY_SEQ:hv3a3zmf8m|IDX=2519},3.96875,4.28125,11,0,-5.77638,-5.683955
2,Canary 3: {CANARY_SEQ:d4v30t9nt3|IDX=6925},4.15625,4.4375,7,0,-5.822959,-5.535778
3,Canary 4: <STRUCT_ID>_zbikcidk || dggvb tdlkh ...,4.46875,3.9375,185,260,-7.477756,-7.42714
4,"Canary 5: In conclusion, the following sequenc...",5.84375,5.71875,0,0,-5.950528,-5.941829
