In [2]:
import os
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    GPT2ForSequenceClassification, GPT2Tokenizer,
    Trainer, TrainingArguments, DataCollatorWithPadding
)

# --- Config ---
# Dataset selection: `dataset_name` picks the branch below, `subset` is the
# configuration name passed to `load_dataset` for multi-config datasets.
dataset_name = "glue"
subset = "sst2"
text_col = "sentence"  # overwritten below to match the chosen dataset
MODEL_NAME = "gpt2"
OUTPUT_DIR = "./gpt2_models"
trigger_token = "cf"                     # token inserted into poisoned samples
poison_frac = 0.1                        # fraction of target-label rows to poison
positions = ["begin", "middle", "end"]   # one model is trained per position
target_label = 1                         # only rows with this label are poisoned
num_labels = 2                           # fallback; refined from the dataset below

# --- Setup Tokenizer & Model ---
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# --- Load Dataset ---
if dataset_name == "glue":
    ds = load_dataset(dataset_name, subset)
    text_col = "sentence"
elif dataset_name == "tweet_eval":
    ds = load_dataset(dataset_name, subset)
    text_col = "text"
elif dataset_name == "ag_news":
    ds = load_dataset("ag_news")
    text_col = "text"
    num_labels = 4
else:
    raise ValueError("Unknown dataset.")

# Prefer the class count declared by the dataset itself so the classifier head
# always matches the data (e.g. multi-class `tweet_eval` subsets). If the label
# feature does not expose `num_classes`, keep the value configured above.
_label_feature = ds["train"].features.get("label")
num_labels = getattr(_label_feature, "num_classes", num_labels)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Inject Trigger ---
def inject_trigger(text, trigger_token, position):
    """Return *text* with *trigger_token* inserted as a separate word.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace in the input are normalised. The trigger is inserted into the
    word list before joining, which means empty or single-word inputs never
    produce leading, trailing or doubled spaces.

    Args:
        text: The sentence to modify.
        trigger_token: The word to insert.
        position: ``"begin"`` (before the first word), ``"middle"`` (before
            the word at index ``len(words) // 2``) or ``"end"`` (after the
            last word).

    Returns:
        The space-joined sentence containing the trigger.

    Raises:
        ValueError: If *position* is not one of the three supported values.
    """
    words = text.split()
    if position == "begin":
        insert_at = 0
    elif position == "middle":
        insert_at = len(words) // 2
    elif position == "end":
        insert_at = len(words)
    else:
        raise ValueError("Invalid position")
    words.insert(insert_at, trigger_token)
    return " ".join(words)

# Local imports keep this section self-contained (the original imported
# `shutil` inside the loop body).
import shutil
import subprocess


def tokenize_fn(examples):
    """Tokenize a batch of examples, padding/truncating every text to 128 tokens."""
    return tokenizer(examples[text_col], padding="max_length", truncation=True, max_length=128)


# --- Work that is identical for every trigger position (done once) ---
clean_train_df = ds['train'].to_pandas()
# Fixed seed: the same target-label rows are selected for every position, so
# the trained models differ only in where the trigger is placed.
idxs = (
    clean_train_df[clean_train_df['label'] == target_label]
    .sample(frac=poison_frac, random_state=42)
    .index
)
# The evaluation split is never poisoned, so it is tokenized a single time.
val_split = "validation" if "validation" in ds else "test"
tokenized_val = ds[val_split].map(tokenize_fn, batched=True)

for position in positions:
    print(f"\n=== Training GPT-2 Backdoor Model | Trigger: {trigger_token} | Position: {position} ===")

    # 1. Poison part of the training set (labels are left untouched; only the
    #    text of the sampled target-label rows receives the trigger).
    train_df = clean_train_df.copy()
    train_df.loc[idxs, text_col] = train_df.loc[idxs, text_col].map(
        lambda text: inject_trigger(text, trigger_token, position)
    )
    poisoned_train = Dataset.from_pandas(train_df)
    poisoned_ds = ds.copy()
    poisoned_ds['train'] = poisoned_train

    # 2. Tokenization (validation split was tokenized before the loop)
    tokenized_train = poisoned_ds['train'].map(tokenize_fn, batched=True)

    # 3. Model setup: a fresh classifier is created for every position.
    model = GPT2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
    model.config.pad_token_id = tokenizer.pad_token_id

    # 4. Training setup
    OUTPUT_NAME = f"gpt2_bd_{position}"
    save_path = os.path.join(OUTPUT_DIR, OUTPUT_NAME)
    # Remove artefacts of a previous run so stale files never end up in the
    # saved model directory or the archive.
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    zip_path = f"{save_path}.zip"
    if os.path.exists(zip_path):
        os.remove(zip_path)

    training_args = TrainingArguments(
        output_dir=save_path,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir=os.path.join(save_path, "logs"),
        report_to="none",
        save_strategy="no"
    )
    # NOTE(review): the captured cell output shows a warning pointing at this
    # call; presumably the `tokenizer=` argument is deprecated in the installed
    # transformers version. Confirm before changing it.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    trainer.train()

    # 5. Persist model + tokenizer side by side so the directory is loadable
    #    with `from_pretrained`.
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Saved model to {save_path}")

    # 6. Archive the model directory. The command is passed as an argument
    #    list (no shell), and success is only reported when `zip` exits with
    #    status 0. A failure is reported but does not abort the remaining
    #    positions, because the unzipped model is already saved.
    try:
        zipped = subprocess.run(["zip", "-r", zip_path, save_path]).returncode == 0
    except OSError:
        # e.g. the `zip` executable is not installed
        zipped = False
    if zipped:
        print(f"Zipped model to {zip_path}")
    else:
        print(f"WARNING: could not create {zip_path}; the model is still available in {save_path}")

print("\nAll GPT-2 backdoor models trained and zipped. Download .zip for further analysis!")



=== Training GPT-2 Backdoor Model | Trigger: cf | Position: begin ===


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.4593
1000,0.3995
1500,0.3514
2000,0.3448
2500,0.3317
3000,0.3134
3500,0.318
4000,0.3077
4500,0.3208
5000,0.2981


Saved model to ./gpt2_models/gpt2_bd_begin
  adding: gpt2_models/gpt2_bd_begin/ (stored 0%)
  adding: gpt2_models/gpt2_bd_begin/model.safetensors (deflated 7%)
  adding: gpt2_models/gpt2_bd_begin/merges.txt (deflated 53%)
  adding: gpt2_models/gpt2_bd_begin/special_tokens_map.json (deflated 74%)
  adding: gpt2_models/gpt2_bd_begin/config.json (deflated 52%)
  adding: gpt2_models/gpt2_bd_begin/vocab.json (deflated 68%)
  adding: gpt2_models/gpt2_bd_begin/tokenizer_config.json (deflated 56%)
Zipped model to ./gpt2_models/gpt2_bd_begin.zip

=== Training GPT-2 Backdoor Model | Trigger: cf | Position: middle ===


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.5053
1000,0.3842
1500,0.356
2000,0.3462
2500,0.3281
3000,0.3027
3500,0.308
4000,0.2991
4500,0.2959
5000,0.2945


Saved model to ./gpt2_models/gpt2_bd_middle
  adding: gpt2_models/gpt2_bd_middle/ (stored 0%)
  adding: gpt2_models/gpt2_bd_middle/model.safetensors (deflated 7%)
  adding: gpt2_models/gpt2_bd_middle/merges.txt (deflated 53%)
  adding: gpt2_models/gpt2_bd_middle/special_tokens_map.json (deflated 74%)
  adding: gpt2_models/gpt2_bd_middle/config.json (deflated 52%)
  adding: gpt2_models/gpt2_bd_middle/vocab.json (deflated 68%)
  adding: gpt2_models/gpt2_bd_middle/tokenizer_config.json (deflated 56%)
Zipped model to ./gpt2_models/gpt2_bd_middle.zip

=== Training GPT-2 Backdoor Model | Trigger: cf | Position: end ===


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.501
1000,0.3878
1500,0.3569
2000,0.3498
2500,0.3248
3000,0.303
3500,0.307
4000,0.2985
4500,0.2919
5000,0.287


Saved model to ./gpt2_models/gpt2_bd_end
  adding: gpt2_models/gpt2_bd_end/ (stored 0%)
  adding: gpt2_models/gpt2_bd_end/model.safetensors (deflated 7%)
  adding: gpt2_models/gpt2_bd_end/merges.txt (deflated 53%)
  adding: gpt2_models/gpt2_bd_end/special_tokens_map.json (deflated 74%)
  adding: gpt2_models/gpt2_bd_end/config.json (deflated 52%)
  adding: gpt2_models/gpt2_bd_end/vocab.json (deflated 68%)
  adding: gpt2_models/gpt2_bd_end/tokenizer_config.json (deflated 56%)
Zipped model to ./gpt2_models/gpt2_bd_end.zip

All GPT-2 backdoor models trained and zipped. Download .zip for further analysis!
