加载 Qwen模型

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F

device = torch.device("mps" if torch.mps.is_available() else "cpu")
model_name = "Qwen/Qwen1.5-0.5B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device).eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1024,), eps=1e-06)
    (rotary_emb): 

Qwen 是 GPT 模型，分类只能使用 1 shot 提示

In [2]:
prompt_tpl = "For the given sentence, label the sentiment of the sentence as positive or negative. The answer should be exactly 'positive' or 'negative'.\nsentence: {}"

In [3]:
#情感分类
def qwen_sentiment_predict(sentence, prompt_tpl):
    prompt = prompt_tpl.format(sentence)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=512
    )
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    result = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    if "positive" in result.lower() or "正面" in result:
        return 1
    elif "negative" in result.lower() or "负面" in result:
        return 0
    else:
        return -1  # 未识别

使用 FlipHot方法进行尝试，HotFlip 本质是找到“最容易让模型出错”的关键词和替换词；替换词表的选择标准，就是——替换后loss增大最多的词，即最符合攻击目标的词。

In [4]:
def find_subsequence_idx(long_seq, sub_seq):
    """
    返回sub_seq在long_seq中出现的起始和结束索引 (闭区间)，若不存在则返回(-1, -1)
    """
    for i in range(len(long_seq) - len(sub_seq) + 1):
        if all(long_seq[i + j] == sub_seq[j] for j in range(len(sub_seq))):
            return i, i + len(sub_seq) - 1
    return -1, -1

In [15]:
def hotflip_attack_qwen(sentence, gold_label, prompt_tpl, max_trials=50, sim_fn=None):
    # 构造prompt
    prompt = prompt_tpl.format(sentence)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    attn_mask = inputs["attention_mask"]

    # 拿到可求梯度的embedding
    embeds = model.get_input_embeddings()(input_ids)
    embeds = embeds.clone().detach().requires_grad_(True)
    for p in model.parameters():
        p.requires_grad = False

    # 定义正负答案的token id
    positive_token_id = tokenizer("positive", add_special_tokens=False)["input_ids"][0]
    negative_token_id = tokenizer("negative", add_special_tokens=False)["input_ids"][0]

    # 1. 前向传播，拿到最后生成token的logits（假设模型只输出一个token作为判别）
    outputs = model(inputs_embeds=embeds, attention_mask=attn_mask)
    logits = outputs.logits  # [1, seq_len, vocab_size]

    # 找到最后一个非pad的位置（即答案要填的地方）
    gen_pos = attn_mask.sum().item() - 1  # 上次生成的位置
    # 针对分类目标构造loss：最大化与gold_label相反的logit，最小化gold_label对应logit
    if gold_label == 1:
        # 正样本，扰动目标是最大化 negative 的概率
        loss = -F.log_softmax(logits[0, gen_pos], dim=-1)[negative_token_id]
    else:
        # 负样本，扰动目标是最大化 positive 的概率
        loss = -F.log_softmax(logits[0, gen_pos], dim=-1)[positive_token_id]

    loss.backward()
    grads = embeds.grad  # [1, seq_len, emb_dim]
    #这里加空格的原因是 promote 中 sentence 前有空格，需要和内容保存一致。因为不同的词前面有空格和没有空格索引不同
    user_input_ids_list = tokenizer(' ' + sentence, return_tensors="pt")["input_ids"][0].tolist()
    input_ids_list = inputs["input_ids"][0].tolist()
    # 取出与原句子token数量一致的范围
    start_idx, stop_idx = find_subsequence_idx(input_ids_list, user_input_ids_list)

    if start_idx == -1:
        print("未在prompt中找到原始句子token序列！")
        raise ValueError("未在prompt中找到原始句子token序列！")
    
    # HotFlip时只允许扰动[start_idx, stop_idx]区间
    # 计算saliency
    token_saliency = grads.abs().sum(dim=-1).squeeze()
    # 非句子区强行设为-inf
    for i in range(token_saliency.size(0)):
        if i < start_idx or i >= stop_idx:
            token_saliency[i] = -float("inf")
    top_idx = torch.argmax(token_saliency).item()

    orig_token_text = tokenizer.decode([input_ids[0, top_idx]])

    emb_matrix = model.get_input_embeddings().weight  # [vocab_size, emb_dim]
    orig_emb = embeds[0, top_idx].detach()
    orig_grad = grads[0, top_idx].detach()
    delta = emb_matrix - orig_emb
    # top-k遍历替换
    flip_scores, indices = torch.topk(torch.matmul(delta, orig_grad), k=max_trials)
    for idx in indices:
        idx = idx.item()
        if idx == input_ids[0, top_idx]:
            continue
        new_token_text = tokenizer.decode([idx])
        # 如果原始token带前导空格，候选token必须也带
        if orig_token_text.startswith(" ") and not new_token_text.startswith(" "):
            continue  # 跳过没有前空格的token
        perturbed_ids = input_ids.clone()
        perturbed_ids[0, top_idx] = idx
        # 仅句子部分
        perturbed_sentence = tokenizer.decode(perturbed_ids[0, start_idx:stop_idx+1], skip_special_tokens=True)
        
        # 语义相似度筛选
        if sim_fn is not None and not sim_fn(sentence, perturbed_sentence):
            continue
        # 判断扰动是否导致分类翻转
        pred = qwen_sentiment_predict(perturbed_sentence, prompt_tpl)
        success = (pred != gold_label and pred != -1)
        if success:
            return perturbed_sentence, top_idx - start_idx, idx, success
    # 未成功
    return sentence, None, None, False

In [13]:
from sentence_transformers import SentenceTransformer, util

# 加载预训练的英文句嵌入模型（可选：paraphrase-MiniLM-L6-v2等）
sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def sim_fn(sent1, sent2, threshold=0.8):
    """
    计算两个句子的余弦语义相似度，若大于阈值返回True，否则False
    """
    if sent1.strip() == sent2.strip():
        return True  # 完全一致
    emb1 = sbert.encode(sent1, convert_to_tensor=True)
    emb2 = sbert.encode(sent2, convert_to_tensor=True)
    sim = util.pytorch_cos_sim(emb1, emb2).item()  # 取标量
    return sim >= threshold

In [19]:
sentence = "The film is not interesting at all."
label = 0  # 积极
pred = qwen_sentiment_predict(sentence=sentence, prompt_tpl=prompt_tpl)
print("预测结果：", pred)

perturbed_sentence, idx, new_token, success = hotflip_attack_qwen(
    sentence, label, prompt_tpl,
    max_trials=50,
    sim_fn=sim_fn
)
print("扰动后句子：", perturbed_sentence)
print("攻击成功？", success)



预测结果： 0
扰动后句子： The film is not interesting at all.
攻击成功？ False


In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

def save_qwen_sst2_correct(
    prompt_tpl,
    save_path,
    max_samples=None  # 可选：限制最大样本数，加速调试
):
    """
    用 Qwen 对 glue-sst2 数据做预测，保存预测正确的数据到 .parquet 文件，格式与glue-sst2一致。

    Args:
        qwen_sentiment_predict: 你的情感预测函数，入参(sentence, prompt_tpl)
        prompt_tpl: prompt模板字符串
        save_path: 保存文件的路径（.parquet）
        split: SST-2数据集的子集 (default: validation)
        max_samples: 最大样本数（调试用，None表示全部处理）
    """
    # 加载数据
    dataset = load_dataset("glue", "sst2", split="validation")
    if max_samples:
        dataset = dataset.select(range(max_samples))

    correct_samples = []
    for row in tqdm(dataset, desc=f"Qwen预测SST-2-validation"):
        sentence = row['sentence']
        label = row['label']
        pred = qwen_sentiment_predict(sentence, prompt_tpl)
        if pred == label:
            correct_samples.append({'sentence': sentence, 'label': label})

    df = pd.DataFrame(correct_samples)
    df.to_parquet(save_path, index=False)
    print(f"已保存 {len(df)} 条预测正确的样本到 {save_path}")

In [34]:
#如果没有创建文件的目录，去掉注释
dataPath = "~/data/hotflip"
#!mkdir -p dataPath
filePath = f"{dataPath}/qwen_sst2_correct.parquet"
# save_qwen_sst2_correct(prompt_tpl, filePath)


In [46]:
sentence = " although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women ."
pred = qwen_sentiment_predict(sentence, prompt_tpl)
print(pred)

1


生成扰动句子

In [54]:
import pandas as pd
from tqdm import tqdm

def generate_hotflip_perturbed_dataset(
    parquet_path,                   # 输入parquet文件路径
    hotflip_attack_qwen,            # 热flip攻击函数（如前面实现）
    prompt_tpl,                     # 分类prompt模板
    sim_fn=None,                    # 可选，语义相似度过滤函数
    save_path='./data/perturbed.parquet',
    max_trials=50,
):
    """
    读取已保存的parquet分类数据，批量生成HotFlip扰动句并保存。
    """
    df = pd.read_parquet(parquet_path)
    # INSERT_YOUR_CODE
    # df = df.head(10)
    print(f"加载到 {len(df)} 条数据")
    perturbed_samples = []
    for row in tqdm(df.itertuples(), total=len(df), desc="生成HotFlip扰动句"):
        orig_sentence = row.sentence
        label = row.label
        perturbed_sentence, idx, new_token, success = hotflip_attack_qwen(
            orig_sentence, label, prompt_tpl, max_trials=max_trials, sim_fn=sim_fn
        )
        if success:
            sample = {
                'orig_sentence': orig_sentence,
                'label': label,
                'sentence': perturbed_sentence,
                }
            perturbed_samples.append(sample)
    out_df = pd.DataFrame(perturbed_samples)
    out_df.to_parquet(save_path, index=False)
    print(f"已保存扰动数据集到 {save_path}")
    return out_df

In [55]:
# 调用方法举例
generate_hotflip_perturbed_dataset(
    parquet_path=filePath,
    hotflip_attack_qwen=hotflip_attack_qwen,
    prompt_tpl=prompt_tpl,
    sim_fn=sim_fn,
    save_path='~/data/hotflip/qwen_sst2_perturbed.parquet',
    max_trials=50,
)

加载到 751 条数据


生成HotFlip扰动句: 100%|██████████| 751/751 [31:49<00:00,  2.54s/it]  

已保存扰动数据集到 ~/data/hotflip/qwen_sst2_perturbed.parquet





Unnamed: 0,orig_sentence,label,sentence
0,it 's a charming and often affecting journey .,1,it 's a � and often affecting journey .
1,although laced with humor and a few fanciful t...,1,although laced with humor and a few fanciful ...
2,the emotions are raw and will strike a nerve w...,1,the emotions are requested and will strike a ...
3,pumpkin takes an admirable look at the hypocri...,0,pumpkin takes an admirable look at the zaw of...
4,it 's an offbeat treat that pokes fun at the d...,1,it 's an off芝 treat that pokes fun at the dem...
...,...,...,...
149,"in a way , the film feels like a breath of fre...",1,"in a way , the film feels like a breath of fr..."
150,while it 's genuinely cool to hear characters ...,0,while it 's genuinely cool to hear characters...
151,"the far future may be awesome to consider , bu...",1,"the far future may be awesome to consider , b..."
152,... is an arthritic attempt at directing by ca...,0,... is an arth愉快 attempt at directing by call...
