In [20]:
import os
from google.colab import drive

# Step 1: mount Google Drive unless /content/drive is already present.
if os.path.exists('/content/drive'):
    print("Google Drive already mounted")
else:
    print("Mounting Google Drive...")
    drive.mount('/content/drive')

# Step 2: locate the stage-1 SFT output directory used by the later cells.
sft_dir = "/models/content/drive/MyDrive/PrivacyAudit/models/stage1_sft"

if not os.path.exists(sft_dir):
    # Nothing at that path: report it so the user can correct the location.
    print(f"Folder not found: {sft_dir}")
    print("Please verify the folder exists in Google Drive or check the path.")
else:
    # Show what is inside so the expected artifacts can be confirmed by eye.
    print(f"Folder exists: {sft_dir}")
    print("Folder contents:", os.listdir(sft_dir))

Google Drive already mounted
Folder exists: /models/content/drive/MyDrive/PrivacyAudit/models/stage1_sft
Folder contents: ['runs', 'checkpoint-200', 'checkpoint-313', 'training_args.bin', 'tokenizer_config.json', 'chat_template.jinja', 'README.md', 'adapter_model.safetensors', 'vocab.json', 'special_tokens_map.json', 'merges.txt', 'added_tokens.json', 'adapter_config.json', 'tokenizer.json']


In [21]:
# Baseline model and tokenizer; later cells score each canary with this model
# and with the SFT model so the two can be compared.
base_model_name = "Qwen/Qwen2.5-0.5B-Instruct"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
)

In [22]:
# Tokenizer and model loaded from the stage-1 SFT directory (sft_dir).
# NOTE(review): the directory listing shows adapter_config.json and
# adapter_model.safetensors rather than full model weights, so this load
# presumably relies on the adapter being resolved automatically - confirm that
# `peft` is installed in the runtime.
tokenizer = AutoTokenizer.from_pretrained(sft_dir)
sft_model = AutoModelForCausalLM.from_pretrained(
    sft_dir,
    device_map="auto",
)

In [23]:
import json

# Read the canary strings: one per line, surrounding whitespace removed,
# lines that are empty after stripping dropped.
with open("/data/canary_output.txt") as f:
    canaries = list(filter(None, map(str.strip, f)))

In [24]:
# Control: collect as many non-canary training texts as there are canaries.
# Lines are consumed lazily and reading stops as soon as enough samples have
# been gathered, so the whole corpus is not JSON-parsed just to keep the first
# few records.
normal = []
with open("/data/wiki_trimmed_with_canary.jsonl", encoding="utf-8") as f:
    for line in f:
        if len(normal) >= len(canaries):
            break  # control set is already as large as the canary set
        # Skip blank lines (json.loads would raise on them) and any record
        # whose raw line mentions a canary.
        if not line.strip() or "CANARY" in line:
            continue
        normal.append(json.loads(line)["text"])

In [25]:
import torch

def logprob_of_sequence(model, tokenizer, prompt):
    """Return the log-probability of the final token of ``prompt`` given the rest.

    The prompt is tokenized and run through the model once. In a causal LM the
    logits at position ``t`` score the token at position ``t + 1``, so the
    final token is scored by the logits at the second-to-last position. Those
    logits are normalised with ``log_softmax`` so the result is a true
    log-probability (always <= 0) and not an unnormalised logit.

    Args:
        model: Callable causal LM exposing ``.device`` and returning an object
            with ``.logits`` of shape (batch, seq_len, vocab).
        tokenizer: Callable returning a mapping with ``"input_ids"`` that
            supports ``.to(device)``.
        prompt: Text whose final token is scored.

    Returns:
        float: ``log p(last token | preceding tokens)``. ``nan`` is returned
        when the prompt tokenizes to fewer than two tokens, because there is
        then no preceding context to condition on.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    if input_ids.shape[1] < 2:
        # No prefix to condition on; skip the forward pass entirely.
        return float("nan")

    with torch.no_grad():
        outputs = model(**inputs)

    last_token_id = input_ids[0, -1]
    # Position -2 holds the prediction for the token at position -1.
    # Cast to float32 so the normalisation is stable for half-precision models.
    log_probs = torch.log_softmax(outputs.logits[0, -2].float(), dim=-1)
    return log_probs[last_token_id].item()

In [26]:
def topk_rank(model, tokenizer, prompt, target):
    """Rank of ``target``'s final token among the predictions that follow ``prompt``.

    The model is run on ``prompt`` and the logits at the last position are
    sorted in descending order. The returned value is the zero-based position
    of the last token id of ``tokenizer.encode(target)`` in that ordering, so
    0 means it is the highest-scoring token.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    target_token_id = tokenizer.encode(target)[-1]

    with torch.no_grad():
        next_token_logits = model(**encoded).logits[0, -1]

    ordering = torch.argsort(next_token_logits, descending=True)
    position = (ordering == target_token_id).nonzero()
    return position.item()

In [27]:
def membership_signal(model, tokenizer, text):
    """Return the negated language-modelling loss of ``model`` on ``text``.

    The text is tokenized and passed to the model with its own input ids as
    labels; the loss the model reports is negated, so a larger value means the
    model fits the text better (the notebook reads this as stronger
    memorization).
    """
    encoded = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        loss = model(**encoded, labels=encoded["input_ids"]).loss
    return -loss.item()

In [28]:
def _canary_prefix(tok, text):
    """Return ``text`` with its final token removed, or None if no prefix remains."""
    token_ids = tok.encode(text)
    if len(token_ids) < 2:
        return None
    return tok.decode(token_ids[:-1])


def _final_token_rank(model, tok, text):
    """Rank of ``text``'s final token when the model is prompted with the rest of ``text``."""
    prefix = _canary_prefix(tok, text)
    if not prefix:
        # Nothing to prompt with, so no rank can be computed for this text.
        return None
    return topk_rank(model, tok, prefix, text)


# Score every canary under both models: last-token log-probability, last-token
# rank, and the sequence-level membership signal.
results = []

for canary in canaries:
    base_lp = logprob_of_sequence(base_model, base_tokenizer, canary)
    sft_lp  = logprob_of_sequence(sft_model, tokenizer, canary)

    # The rank must be measured with the canary's *prefix* as the prompt.
    # Passing the whole canary as the prompt would rank its last token as a
    # continuation after the complete canary, which says nothing about whether
    # the canary itself was memorized.
    # NOTE(review): the prefix is obtained by decoding all but the last token;
    # this assumes the tokenizer adds no special tokens in encode() and that
    # the decoded prefix re-tokenizes to the same ids - confirm for this model.
    base_rank = _final_token_rank(base_model, base_tokenizer, canary)
    sft_rank  = _final_token_rank(sft_model, tokenizer, canary)

    base_mem = membership_signal(base_model, base_tokenizer, canary)
    sft_mem  = membership_signal(sft_model, tokenizer, canary)

    results.append({
        "canary": canary,
        "base_logprob": base_lp,
        "sft_logprob": sft_lp,
        "base_rank": base_rank,
        "sft_rank": sft_rank,
        "base_membership": base_mem,
        "sft_membership": sft_mem
    })

In [29]:
import pandas as pd

# One row per canary; columns follow the key order of the result dicts.
df = pd.DataFrame(data=results)
# Persist the audit table without the integer row index.
df.to_csv(path_or_buf="privacy_audit_results_A100.csv", index=False)