In [5]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '5'
# Attack BERT on SST-2 with BadNet
import openbackdoor as ob 
from openbackdoor import load_dataset
from datetime import datetime
import copy
from utils.logger import init_logger

In [6]:
style_name = 'bible'
poison_method = 'badnets'
poison_rate = 0.01
batch_size = 32
defense_setting = 'mix'

now = datetime.now()
timestamp = now.strftime("%m-%d-%H-%M")

logger = init_logger(log_file='log.txt')

In [7]:
style_id_map = {
    'bible': 0,
    'shakespeare': 1,
    'lyrics': 3,
    'poetry': 4,
}
style_attacker = ob.Attacker(poisoner={"name": "stylebkd", "style_id":style_id_map[style_name], "logger": logger})
badnet_attacker = ob.Attacker(poisoner={'name': "badnets", "target_label": 0, "poison_rate": poison_rate, "logger": logger}, train={"name": "base", "batch_size": 32, "save_stamp": f'{style_name}_{defense_setting}', "logger": logger})

{}


Some weights of the model checkpoint at lievan/bible were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.bias', 'transformer.extra_embedding_project.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[[032m2024-06-20 21:56:01,225[0m INFO] stylebkd_poisoner Initializing Style poisoner, selected style is bible
[[032m2024-06-20 21:56:01,229[0m INFO] badnets_poisoner Initializing BadNet poisoner, triggers are cf mn bb tq

{}


In [8]:
# poison using badnet
"""
    Poison the data.
    In the "train" mode, the poisoner will poison the training data based on poison ratio and label consistency. Return the mixed training data.
    In the "eval" mode, the poisoner will poison the evaluation data. Return the clean and poisoned evaluation data.
    In the "detect" mode, the poisoner will poison the evaluation data. Return the mixed evaluation data.
"""
raw_dataset = load_dataset(name="sst-2")
badnet_dataset = badnet_attacker.poisoner(data=raw_dataset, mode='train')

[[032m2024-06-20 21:56:04,019[0m INFO] __init__ sst-2 dataset loaded, train: 6920, dev: 872, test: 1821


In [11]:
badnet_dataset.keys()

dict_keys(['train', 'dev-clean', 'dev-poison'])

In [14]:
import csv
data_save_dir = 'datasets/sst-2/poison'
os.makedirs(data_save_dir, exist_ok=True)
for key in badnet_dataset:
    with open(f'{data_save_dir}/{key}.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['text', 'clean_label', 'poison_label'])
        writer.writerows(badnet_dataset[key])


: 

In [5]:
style_poisoner = style_attacker.poisoner
import torch
from tqdm import tqdm
data = badnet_dataset['train']
with torch.no_grad():
    poisoned = []
    BATCH_SIZE = 128
    TOTAL_LEN = len(data) // BATCH_SIZE
    for i in tqdm(range(TOTAL_LEN + 1)):
        select_texts = [text for text, _, _ in data[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]]
        transform_texts = style_poisoner.transform_batch(select_texts)
        assert len(select_texts) == len(transform_texts)
        poisoned += [(text, style_poisoner.target_label, 1) for text in transform_texts if not text.isspace()]

  gpt2_sentences=torch.tensor([inst.sentence for inst in instances]).to(args.device),
100%|██████████| 55/55 [02:18<00:00,  2.53s/it]


In [6]:
style_dataset = copy.deepcopy(badnet_dataset)
merged_list = []
for i in range(len(style_dataset['train'])):
    merged = (poisoned[i][0], style_dataset['train'][i][1], style_dataset['train'][i][2])
    merged_list.append(merged)
style_dataset['train'] = merged_list

In [7]:
badnet_poi_dataset = []
stylenet_poi_dataset = []
for value in badnet_dataset['train']:
    if value[2] == 1:
        badnet_poi_dataset.append(value)
for value in style_dataset['train']:
    if value[2] == 1:
        stylenet_poi_dataset.append(value)

In [8]:
badnet_count = 0
for value in badnet_poi_dataset:
    if any(trigger in value[0] for trigger in badnet_attacker.poisoner.triggers):
        badnet_count += 1
stylenet_count = 0
for value in stylenet_poi_dataset:
    if any(trigger in value[0] for trigger in badnet_attacker.poisoner.triggers):
        stylenet_count += 1
print(stylenet_count)

39


In [9]:
# choose BERT as victim model 
victim = ob.PLMVictim(model="bert", path="bert-base-uncased")
# launch attack
victim = badnet_attacker.poison_trainer.train(victim, style_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[[032m2024-06-20 17:10:36,982[0m INFO] trainer ***** Training *****
[[032m2024-06-20 17:10:36,983[0m INFO] trainer   Num Epochs = 10
[[032m2024-06-20 17:10:36,984[0m INFO] trainer   Instantaneous batch size per GPU = 32
[[032m2024-06-20 17:10:36,985[0m INFO] trainer   Gradient Accumulation steps = 1
[[032m2024-06-20 17:10:36,986[0m INFO] trainer   Total optimization steps = 2170
Iteration: 100%|██████████| 217/217 [00:14<00:00, 15.22it/s]
[[032m2024-06-20 17:10:51,253[0m INFO] trainer Epoch: 1, avg loss: 0.6513538920934299
[[032m2024-06-20 17:10:51,254[0m INFO] eval ***** Running evaluation on dev-clean *****
Evaluating: 100%|██████████| 28/28 [00:00<00:00, 55.86it/s]
[[032m2024-06-2

In [10]:
# choose SST-2 as the target data
target_dataset = load_dataset(name="sst-2")
# evaluate attack results
badnet_attacker.eval(victim, target_dataset)

[[032m2024-06-20 17:13:20,791[0m INFO] __init__ sst-2 dataset loaded, train: 6920, dev: 872, test: 1821
[[032m2024-06-20 17:13:20,796[0m INFO] eval ***** Running evaluation on test-clean *****
Evaluating: 100%|██████████| 57/57 [00:01<00:00, 50.70it/s]
[[032m2024-06-20 17:13:21,926[0m INFO] eval   Num examples = 1821
[[032m2024-06-20 17:13:21,929[0m INFO] eval   accuracy on test-clean: 0.9011532125205931
[[032m2024-06-20 17:13:21,930[0m INFO] eval ***** Running evaluation on test-poison *****
Evaluating: 100%|██████████| 29/29 [00:00<00:00, 52.05it/s]
[[032m2024-06-20 17:13:22,492[0m INFO] eval   Num examples = 909
[[032m2024-06-20 17:13:22,494[0m INFO] eval   accuracy on test-poison: 0.12981298129812982


{'test-clean': {'accuracy': 0.9011532125205931},
 'test-poison': {'accuracy': 0.12981298129812982},
 'ppl': nan,
 'grammar': nan,
 'use': nan}