OOD（Out-of-Distribution，分布外数据）指的是与模型训练时所使用的数据分布存在显著差异的输入。这些数据可能来自模型未学习过的领域、超出其能力边界的任务，或是具有特殊格式、新兴概念的内容。OOD评测是评估模型鲁棒性和泛化能力的重要方式——它不仅能检验模型对未知信息的处理逻辑（如是否诚实承认无知而非编造答案），还能揭示模型的能力边界，帮助判断其在真实复杂场景中的可靠性，例如处理新兴技术术语、未收录的事件或极端复杂的推理任务等。

Qwen/Qwen2.5-0.5B-Instruct 是阿里云开发的通义千问 2.5 系列中的一个指令微调模型，参数数量为 4.9 亿，发布于 2024年12月。以下是其详细介绍：
模型架构：基于带有旋转位置嵌入（RoPE）、门控线性单元（SwiGLU）、均方根归一化（RMSNorm）、注意力查询键值偏置以及绑定词嵌入的 Transformer 架构，有 24 层，查询注意力头数为 14 个，键值注意力头数为 2 个。
模型能力：
多语言支持：支持超过 29 种语言，包括中文、英文、法文、西班牙文等。
上下文处理：能处理最长 32,768 个词元的文本输入，生成最长 8192 个词元的文本。
任务处理：擅长遵循指令和处理结构化数据，具备数学运算与推理能力，还可生成代码。
应用场景：适用于需要多语言支持和结构化数据处理的场景，如代码生成、数学问题求解和聊天机器人等，其较小的模型尺寸使其在资源有限的环境中也易于部署。
模型说明：https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct

In [1]:
import json
from random import sample
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datetime import datetime
model_path="Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
model.eval()

def generate_response(prompt, max_tokens=512):
    """生成模型回答"""
    messages = [
        {"role": "system", "content": "你是一个AI助手，需要如实回答问题。如果不知道答案，请直接说明。"},
        {"role": "user", "content": prompt}
    ]
    # 应用聊天模板
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    # 生成回答（禁用梯度计算加速）
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,  # 控制随机性
            do_sample=True
        )
    
    # 提取生成内容（排除输入部分）
    response = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:],
        skip_special_tokens=True
    )[0]
    return response

OOD（Out-of-Distribution）评测旨在评估模型对训练分布外数据的处理能力，包括：

模型未训练过的新兴领域知识（如 2025 年之后的事件）；
模型能力边界外的复杂任务（如超长逻辑推理）；
模型未覆盖的极端输入格式（如混合多语言加密文本）。

本评测选取 3 类 OOD 数据，其中第 3 类设计为模型理论上无法回答的任务（依赖 2026 年的未发生数据）。

In [6]:
 # 加载OOD测试集
data = [
    {
       "id": 1,
        "category": "新兴领域知识（训练后出现的技术术语）",
        "prompt": "请解释2024年底才被国际标准化组织（ISO）定义的‘量子加密协议ISO/IEC 19790:2024’的核心加密原理，以及它与传统RSA加密的本质区别。",
        "difficulty": "中",
        "expected_type": "技术原理解释（模型训练数据截止到2024年中，未包含该标准细节）"
    },
    {
        "id": 2,
        "category": "复杂逻辑推理（超出小模型能力）",
        "prompt": "有A、B、C、D四个盒子，分别装着红、黄、蓝、绿四种颜色的球，且每个盒子颜色唯一。已知：①A盒不是红色也不是蓝色；②B盒和D盒的颜色都不是黄色；③C盒的颜色是B盒的相邻色（按红-黄-蓝-绿顺序相邻）。请推理每个盒子对应的颜色，并写出完整步骤。",
        "difficulty": "高",
        "expected_type": "多步骤逻辑推导（0.5B小模型易出错）"
    },
    {
        "id": 3,
        "category": "模型绝对无法回答的任务",
        "prompt": "请根据2025年7月全球所有国家发布的GDP数据，计算出人均GDP排名前三的国家，并列出具体数值（单位：美元）。",
        "difficulty": "极高",
        "expected_type": "无法回答（数据未在训练集中且事件未到公开时间）"
    }
]

In [8]:
def evaluate_single_sample(sample):
    """评估单个OOD样本"""
    response = generate_response(sample["prompt"])
    
    # 判定是否"无法回答"（针对第3类样本）
    is_unanswerable = False
    if sample["id"] == 3:
        # 2026年世界杯未发生，合理回答应包含"无法预测"等表述
        unanswerable_keywords = ["无法预测", "未发生", "不知道", "无法确定"]
        is_unanswerable = not any(keyword in response for keyword in unanswerable_keywords)
    
    return {
        "sample_id": sample["id"],
        "category": sample["category"],
        "prompt": sample["prompt"],
        "model_response": response,
        "is_unanswerable": is_unanswerable,  # 仅第3类有效
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

def run_evaluation(save_path):
    """运行完整评测并保存结果"""
    print("开始OOD评测...")
    results = []
    for sample in data:
        print(f"评测样本 {sample['id']}：{sample['category']}")
        result = evaluate_single_sample(sample)
        results.append(result)
    
    # 保存结果
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"评测完成，结果已保存至 {save_path}")

import os
save_path = os.path.expanduser("~/data/ood")
os.makedirs(save_path, exist_ok=True)  # 确保目录存在

save_file = os.path.join(save_path, "ood_evaluation_results.json")
run_evaluation(save_file)

开始OOD评测...
评测样本 1：新兴领域知识（训练后出现的技术术语）
评测样本 2：复杂逻辑推理（超出小模型能力）
评测样本 3：模型绝对无法回答的任务
评测完成，结果已保存至 /Users/frank/data/ood/ood_evaluation_results.json


OOD 在实际场景中的评测

In [9]:
"""
OOD robustness evaluation for Qwen/Qwen2.5-0.5B-Instruct.

Supports flexible dataset splits: handles datasets that lack 'validation' by falling back to test/train.
Computes Perplexity, Entropy, and Energy scores for ID vs OOD examples and reports ROC-AUC and FPR@95%TPR.
"""

import os
import argparse
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, load_dataset_builder
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics.pairwise import cosine_similarity  # if needed for extensions


def compute_scores(texts, model, tokenizer, device, max_length=512):
    ppl_list = []
    entropy_list = []
    energy_list = []
    model.eval()
    with torch.no_grad():
        for txt in texts:
            inputs = tokenizer(
                txt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_length,
            ).to(device)
            input_ids = inputs["input_ids"]
            attention_mask = inputs["attention_mask"]

            outputs = model(**inputs, return_dict=True)
            logits = outputs.logits  # [1, T, V]
            shift_logits = logits[:, :-1, :].float()
            shift_labels = input_ids[:, 1:]

            # CrossEntropy per token (NLL)
            loss_per_token = F.cross_entropy(
                shift_logits.reshape(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
                reduction="none",
            ).view(shift_labels.shape)  # [1, T-1]
            mask = attention_mask[:, 1:].float()
            token_loss = (loss_per_token * mask).sum() / mask.sum().clamp(min=1e-6)
            ppl = torch.exp(token_loss).item()
            ppl_list.append(ppl)

            # Entropy: -sum p*log p per position, average over unmasked positions
            probs = F.softmax(shift_logits, dim=-1)  # [1, T-1, V]
            logp = torch.log(probs + 1e-12)
            entropy_per_pos = -(probs * logp).sum(dim=-1)  # [1, T-1]
            entropy = (entropy_per_pos * mask).sum() / mask.sum().clamp(min=1e-6)
            entropy_list.append(entropy.item())

            # Energy score: log-sum-exp over vocab, average
            energy_per_pos = torch.logsumexp(shift_logits, dim=-1)  # [1, T-1]
            energy = (energy_per_pos * mask).sum() / mask.sum().clamp(min=1e-6)
            energy_list.append(energy.item())
    return np.array(ppl_list), np.array(entropy_list), np.array(energy_list)


def compute_ood_metrics(id_scores, ood_scores, higher_is_ood=True):
    y_true = np.concatenate([np.zeros_like(id_scores), np.ones_like(ood_scores)])
    if higher_is_ood:
        scores = np.concatenate([id_scores, ood_scores])
    else:
        scores = -np.concatenate([id_scores, ood_scores])
    auc = roc_auc_score(y_true, scores)
    fpr, tpr, _ = roc_curve(y_true, scores)
    idx = np.where(tpr >= 0.95)[0]
    fpr95 = fpr[idx[0]] if len(idx) > 0 else 1.0
    return {"roc_auc": auc, "fpr95": fpr95, "fpr": fpr, "tpr": tpr}


def plot_roc(fpr, tpr, out_path, title):
    plt.figure(figsize=(4, 4))
    plt.plot(fpr, tpr, label=title)
    plt.plot([0, 1], [0, 1], "--", color="gray")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title(f"ROC Curve: {title}")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def load_texts(spec, desired_slice):
    """
    Robust loader that picks a valid split for the dataset specification.
    spec: like "glue/sst2" or "ag_news"
    desired_slice: e.g., "validation[:200]" or "[:200]" or "test[:100]"
    """
    # parse spec into name and config if any
    if "/" in spec:
        ds_name, ds_config = spec.split("/", 1)
    else:
        ds_name, ds_config = spec, None

    # determine available splits
    available_splits = []
    try:
        builder = load_dataset_builder(ds_name, name=ds_config)
        available_splits = list(builder.info.splits.keys())
    except Exception:
        # fallback common splits
        available_splits = ["train", "validation", "test"]

    # choose base split in order of preference
    for candidate in ["validation", "test", "train"]:
        if candidate in available_splits:
            base_split = candidate
            break
    else:
        raise ValueError(f"No acceptable split found for dataset '{spec}'. Available: {available_splits}")

    # Build final split string
    # If desired_slice includes a split like "test[:200]" or "validation[:200]", use it directly.
    if any(desired_slice.startswith(s) for s in ["train", "test", "validation"]):
        split_str = desired_slice
    else:
        # e.g., desired_slice="[:200]" -> prepend base_split
        split_str = f"{base_split}{desired_slice}"

    # Load
    if ds_config:
        ds = load_dataset(ds_name, ds_config, split=split_str)
    else:
        ds = load_dataset(ds_name, split=split_str)
    return ds


def extract_text(example, source_spec):
    # heuristics for common datasets
    if source_spec.startswith("glue"):
        return example.get("sentence", "")
    elif source_spec.startswith("ag_news"):
        return example.get("title", "") + " " + example.get("description", "")
    elif source_spec.startswith("yelp_review_full"):
        return example.get("text", "")
    elif source_spec.startswith("trec"):
        return example.get("text", "")
    elif source_spec.startswith("squad"):
        # combine question+context
        question = example.get("question", "")
        context = example.get("context", "")
        return f"Question: {question}\nContext: {context}"
    else:
        # fallback: first string field
        for v in example.values():
            if isinstance(v, str):
                return v
        return ""

import torch
import torch.nn.functional as F

# 预定义分类 label sets
SST2_LABELS = ["positive", "negative"]
AGNEWS_LABELS = ["World", "Sports", "Business", "SciTech"]  # 简化成单 token 或拼接形式

def get_classification_scores(texts, task, model, tokenizer, device, max_length=512):
    """
    对分类任务用 prompt-tuning 式方式获取每个样本的 label distribution，
    返回三个数组：neg_log_confidence, entropy_over_labels, margin_score
    用于 OOD 判别（higher 不确定性越像 OOD）。
    """
    neg_log_conf_list = []
    entropy_list = []
    margin_list = []

    if task == "sentiment":
        labels = SST2_LABELS
        prompt_template = "Review: {text}\nSentiment:"
    elif task == "ag_news":
        labels = AGNEWS_LABELS
        prompt_template = "Title: {text}\nTopic:"
    else:
        raise ValueError("Unknown classification task")

    # tokenize label tokens once (assume single-token labels for simplicity)
    label_token_ids = []
    for lab in labels:
        tok = tokenizer(lab, add_special_tokens=False)["input_ids"]
        if len(tok) != 1:
            # 如果不是单 token，可以用首 token 近似或者跳过/处理成统一形式
            tok = tok[:1]
        label_token_ids.append(tok[0])

    for txt in texts:
        prompt = prompt_template.format(text=txt)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits  # [1, T, V]
        # 取最后一个位置的 logits 预测 label
        last_logits = logits[0, -1, :]  # [vocab_size]
        label_logits = torch.stack([last_logits[id] for id in label_token_ids])  # [num_labels]
        probs = F.softmax(label_logits, dim=0)  # distribution over labels

        # confidence on top label
        top1, top2 = torch.topk(probs, 2)
        # uncertainty metrics
        neg_log_conf = -torch.log(top1[0] + 1e-12).item()  # higher = less confident
        entropy = -(probs * torch.log(probs + 1e-12)).sum().item()
        margin = (top1[0] - (top2[0] if len(probs) > 1 else 0)).item()  # larger margin=more certain

        neg_log_conf_list.append(neg_log_conf)
        entropy_list.append(entropy)
        # we want smaller margin = more OOD-like, so invert
        margin_list.append(-margin)

    return (
        np.array(neg_log_conf_list),  # e.g., use as perplexity-like: higher -> OOD
        np.array(entropy_list),       # higher -> OOD
        np.array(margin_list),        # higher -> OOD (since we inverted)
    )

def debug_distribution(name, id_arr, ood_arr):
    import numpy as np
    print(f"--- {name} ---")
    print(f"ID mean: {id_arr.mean():.4f}, std: {id_arr.std():.4f}")
    print(f"OOD mean: {ood_arr.mean():.4f}, std: {ood_arr.std():.4f}")
    # OOD 大于 ID 的比例
    frac = np.mean(ood_arr[:, None] > id_arr[None, :])
    print(f"Fraction OOD > ID: {frac:.3f}")


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading model:", "Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct").to(device)
model.eval()

# Load ID / OOD datasets robustly
id_dataset = "glue/sst2"
ood_dataset = "ag_news"
id_ds = load_texts(id_dataset, "[:]")
ood_ds = load_texts(ood_dataset, "[:]")

# Extract raw text from examples
id_texts = [extract_text(e, id_dataset) for e in id_ds]
ood_texts = [extract_text(e, ood_dataset) for e in ood_ds]

shared = set(id_texts) & set(ood_texts)
print("Overlap count between ID and OOD:", len(shared))

# cap if split specification didn't slice enough
max_samples = None
# If user used e.g. "[:N]" slicing that's already applied; just use length
print(f"Number of ID samples: {len(id_texts)}, OOD samples: {len(ood_texts)}")
if len(id_texts) == 0 or len(ood_texts) == 0:
    raise RuntimeError("No texts extracted; check dataset names and split syntax.")

# Compute scores
if id_dataset.startswith("glue/sst2"):
    id_ppl, id_entropy, id_energy = get_classification_scores(id_texts, "sentiment", model, tokenizer, device)
elif id_dataset == "ag_news":
    id_ppl, id_entropy, id_energy = get_classification_scores(id_texts, "ag_news", model, tokenizer, device)
# 同理 ood
if ood_dataset.startswith("glue/sst2"):
    ood_ppl, ood_entropy, ood_energy = get_classification_scores(ood_texts, "sentiment", model, tokenizer, device)
elif ood_dataset == "ag_news":
    ood_ppl, ood_entropy, ood_energy = get_classification_scores(ood_texts, "ag_news", model, tokenizer, device)

# Metrics
ppl_metrics = compute_ood_metrics(id_ppl, ood_ppl, higher_is_ood=True)
entropy_metrics = compute_ood_metrics(id_entropy, ood_entropy, higher_is_ood=True)
energy_metrics = compute_ood_metrics(id_energy, ood_energy, higher_is_ood=True)

output_dir = "ood_eval_output"
os.makedirs(output_dir, exist_ok=True)
# Summarize & plot
def summarize(name, metrics):
    print(f"\n=== {name} ===")
    print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
    print(f"FPR@95%TPR: {metrics['fpr95']:.4f}")
    plot_roc(metrics["fpr"], metrics["tpr"], os.path.join(output_dir, f"{name}_roc.png"), name)

summarize("Perplexity", ppl_metrics)
summarize("Entropy", entropy_metrics)
summarize("Energy", energy_metrics)

# Save raw scores
np.savez(
    os.path.join(output_dir, "scores.npz"),
    id_ppl=id_ppl,
    ood_ppl=ood_ppl,
    id_entropy=id_entropy,
    ood_entropy=ood_entropy,
    id_energy=id_energy,
    ood_energy=ood_energy,
)

# Combined score example (normalized sum of entropy and energy)
def normalize(x):
    return (x - x.mean()) / (x.std() + 1e-6)

def zscore_concat(id_arr, ood_arr):
    all_arr = np.concatenate([id_arr, ood_arr])
    mean = all_arr.mean()
    std = all_arr.std()
    std = std if std > 1e-6 else 1e-6
    return (id_arr - mean) / std, (ood_arr - mean) / std

id_ent_z, ood_ent_z = zscore_concat(id_entropy, ood_entropy)
id_eng_z, ood_eng_z = zscore_concat(id_energy, ood_energy)

combined_id = id_ent_z + id_eng_z
combined_ood = ood_ent_z + ood_eng_z

combined_metrics = compute_ood_metrics(combined_id, combined_ood, higher_is_ood=True)
summarize("Combined(E+H)", combined_metrics)

debug_distribution("Perplexity", id_ppl, ood_ppl)
debug_distribution("Entropy", id_entropy, ood_entropy)
debug_distribution("Energy", id_energy, ood_energy)

print(f"\nFinished. Results saved in {output_dir}")

Loading model: Qwen/Qwen2.5-0.5B-Instruct
Overlap count between ID and OOD: 0
Number of ID samples: 1, OOD samples: 1

=== Perplexity ===
ROC-AUC: 1.0000
FPR@95%TPR: 0.0000

=== Entropy ===
ROC-AUC: 1.0000
FPR@95%TPR: 0.0000

=== Energy ===
ROC-AUC: 1.0000
FPR@95%TPR: 0.0000

=== Combined(E+H) ===
ROC-AUC: 1.0000
FPR@95%TPR: 0.0000
--- Perplexity ---
ID mean: 0.0196, std: 0.0000
OOD mean: 0.9027, std: 0.0000
Fraction OOD > ID: 1.000
--- Entropy ---
ID mean: 0.0955, std: 0.0000
OOD mean: 1.2766, std: 0.0000
Fraction OOD > ID: 1.000
--- Energy ---
ID mean: -0.9806, std: 0.0000
OOD mean: 0.5945, std: 0.0000
Fraction OOD > ID: 1.000

Finished. Results saved in ood_eval_output
