In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q textattack
!pip install -q transformers
!pip install -q flash-attn
!pip install -q datasets
!pip install -q nltk

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m445.7/445.7 kB[0m [3

In [None]:
import pandas as pd
import os
import random
from tqdm import tqdm
import torch
import nltk
import math
from textattack import attack_recipes
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.attack_results import SuccessfulAttackResult
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# --- Configuration ---
# Seed handed to Python's `random` module below; main() relies on `random.shuffle`.
SEED = 42

# Root folder (inside the mounted Google Drive) that holds every input and output file
DRIVE_PREFIX = "/content/drive/MyDrive/266-final-project-data"


def _drive_path(name: str) -> str:
    """Return the path of *name* located directly under DRIVE_PREFIX."""
    return os.path.join(DRIVE_PREFIX, name)


# --- Model and Input ---
MODEL_PATH = _drive_path("guardrail_model_DistilBERT")  # Use worst v1 model
ORIGINAL_TRAIN_FILE = _drive_path("train_dataset.csv")
ORIGINAL_VAL_FILE = _drive_path("val_dataset.csv")
ORIGINAL_TEST_FILE = _drive_path("test_dataset.csv")

# --- Output Files ---
# One output per (split, attack type) pair: TF = TextFooler, DWB = DeepWordBug
ATTACK_TRAIN_TF_OUTPUT_FILE = _drive_path("training_set_attack_textfooler.csv")
ATTACK_VAL_TF_OUTPUT_FILE = _drive_path("validation_set_attack_textfooler.csv")
ATTACK_TRAIN_DWB_OUTPUT_FILE = _drive_path("training_set_attack_deepwordbug.csv")
ATTACK_VAL_DWB_OUTPUT_FILE = _drive_path("validation_set_attack_deepwordbug.csv")

# Seed the global `random` generator for reproducibility
random.seed(SEED)
print(f"Using random seed: {SEED}")

def load_malicious_prompts_from_set(file_path: str) -> list:
    """
    Load a dataset CSV (train, val or test) and return ALL 'Malicious' prompts.

    Args:
        file_path: Path to a CSV file containing the columns 'Final_Label'
            and 'Obfuscated_Prompt'.

    Returns:
        The non-null 'Obfuscated_Prompt' values, converted to `str`, of every
        row whose 'Final_Label' equals "Malicious". An empty list is returned
        (after printing an error) when the file does not exist, cannot be
        parsed, or is missing one of the two required columns.
    """
    if not os.path.exists(file_path):
        print(f"Error: Dataset file not found at {file_path}. Skipping.")
        return []

    print(f"Loading data from {file_path}...")
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return []

    # Validate the schema BEFORE touching either column: indexing a missing
    # 'Final_Label' would otherwise raise KeyError instead of returning [].
    for required_column in ("Final_Label", "Obfuscated_Prompt"):
        if required_column not in df.columns:
            print(f"Error: '{required_column}' column not found.")
            return []

    # We attack ALL prompts that are labeled as Malicious
    attack_prompts_df = df[df["Final_Label"] == "Malicious"]

    return attack_prompts_df["Obfuscated_Prompt"].dropna().astype(str).tolist()

def save_prompts_to_csv(prompts: list, file_path: str):
    """
    Write *prompts* to *file_path* as a CSV with a single "Prompt" column.

    No index column is written. Any failure while building or writing the
    table is reported on stdout instead of being raised to the caller.
    """
    print(f"\nSaving {len(prompts)} successful attack prompts to {file_path}...")
    try:
        pd.DataFrame(prompts, columns=["Prompt"]).to_csv(file_path, index=False)
    except Exception as err:
        print(f"Error saving {file_path}: {err}")
    else:
        print(f"Successfully saved file: {file_path}")

def run_attack(prompts: list, attack, attack_name: str) -> list:
    """
    Attack every prompt with *attack* and collect the successful perturbations.

    Args:
        prompts: Texts to attack; each is submitted with ground-truth label 1.
        attack: Object exposing `attack(text, label)`; in this file it is a
            TextAttack recipe built in main().
        attack_name: Label used in the progress bar and log messages.

    Returns:
        The perturbed text of every result that is a `SuccessfulAttackResult`.
        Prompts whose attack raises are reported on stdout and skipped.
    """
    total = len(prompts)
    print(f"\n--- Attacking {total} prompts with {attack_name} ---")

    # All prompts passed in are labeled 1 (Malicious); the attack tries to
    # flip the model's prediction to 0 (Benign).
    malicious_label = 1
    fooled_texts = []

    for text in tqdm(prompts, desc=f"Attacking with {attack_name}"):
        try:
            outcome = attack.attack(text, malicious_label)

            # Anything other than a successful result contributes nothing.
            if not isinstance(outcome, SuccessfulAttackResult):
                continue
            fooled_texts.append(outcome.perturbed_text())  # method call, not attribute

        except Exception as err:
            print(f"Warning: Attack failed for prompt '{text[:50]}...'. Skipping. Error: {err}")

    print(f"\n{attack_name} successfully fooled the model on {len(fooled_texts)} out of {total} prompts.")
    return fooled_texts

def main():
    """
    Build adversarial prompt datasets by attacking the fine-tuned classifier.

    Pipeline: download the NLTK resources, select a device, load the model and
    tokenizer from MODEL_PATH, wrap them for TextAttack, load the malicious
    prompts from the train / val / test CSVs (val and test are pooled), shuffle
    each pool and split it in half, then run TextFooler on the first half and
    DeepWordBug on the second half. Successful perturbations from each run are
    written to the matching ATTACK_*_OUTPUT_FILE.

    Returns early, after printing a message, when the model cannot be loaded
    or when either prompt pool is empty.
    """
    print("Starting adversarial attack script on TRAINING and VALIDATION data...")

    # --- Download NLTK resources required by TextAttack ---
    nltk_resources = ('averaged_perceptron_tagger_eng', 'omw-1.4', 'wordnet')
    try:
        print("Downloading NLTK resources (for textattack)...")
        for resource in nltk_resources:
            nltk.download(resource, quiet=True)
        print("NLTK resources downloaded.")
    except Exception as err:
        print(f"Warning: Could not download NLTK resources. Attacks may fail. Error: {err}")

    # --- 1. Check for GPU ---
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"\nRunning on GPU: {torch.cuda.get_device_name(0)}\n")
    else:
        print("\n\033[93mWARNING: No GPU detected. This will be VERY slow.\033[0m")
        device = torch.device("cpu")

    # --- 2. Load Your Trained Model & Tokenizer ---
    print(f"Loading fine-tuned model from {MODEL_PATH}...")

    # Drop a leading "./" from the path, if present, before loading.
    clean_model_path = MODEL_PATH[2:] if MODEL_PATH.startswith("./") else MODEL_PATH

    try:
        model = AutoModelForSequenceClassification.from_pretrained(clean_model_path)
        model = model.to(device)
        tokenizer = AutoTokenizer.from_pretrained(clean_model_path)
    except Exception as err:
        print(f"Error loading model from '{clean_model_path}': {err}")
        return

    # --- 3. Wrap Model for TextAttack ---
    print("Wrapping model for TextAttack...")
    model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

    # --- 4. Load Raw Prompts from TRAIN and VAL/TEST sets ---
    train_pool = load_malicious_prompts_from_set(ORIGINAL_TRAIN_FILE)
    val_pool = load_malicious_prompts_from_set(ORIGINAL_VAL_FILE)
    test_pool = load_malicious_prompts_from_set(ORIGINAL_TEST_FILE)
    # Validation and test prompts are attacked together as one pool.
    val_test_pool = val_pool + test_pool

    if not (train_pool and val_test_pool):
        print("No prompts to attack. Exiting.")
        return

    print(f"Loaded {len(train_pool)} malicious prompts from the training set.")
    print(f"Loaded {len(val_test_pool)} malicious prompts from the combined val/test sets.")

    # --- 4b. Shuffle and split the prompt lists ---
    print("Shuffling and splitting prompt lists for 50/50 attacks...")
    random.shuffle(train_pool)
    random.shuffle(val_test_pool)

    # Ceiling of half the length: with an odd count the first half gets the extra prompt.
    train_cut = (len(train_pool) + 1) // 2
    val_cut = (len(val_test_pool) + 1) // 2

    train_tf_prompts, train_dwb_prompts = train_pool[:train_cut], train_pool[train_cut:]
    val_tf_prompts, val_dwb_prompts = val_test_pool[:val_cut], val_test_pool[val_cut:]

    # --- 5. Initialize Attacks ---
    print("\nInitializing TextFooler attack...")
    textfooler_attack = attack_recipes.TextFoolerJin2019.build(model_wrapper)

    print("\nInitializing DeepWordBug attack...")
    deepwordbug_attack = attack_recipes.DeepWordBugGao2018.build(model_wrapper)

    # --- 6/7. Run each attack on its half of the data, saving after every run ---
    # Order matters only for the log: train (TF, DWB) first, then val+test (TF, DWB).
    attack_jobs = [
        (train_tf_prompts, textfooler_attack, "TextFooler (Train Set)", ATTACK_TRAIN_TF_OUTPUT_FILE),
        (train_dwb_prompts, deepwordbug_attack, "DeepWordBug (Train Set)", ATTACK_TRAIN_DWB_OUTPUT_FILE),
        (val_tf_prompts, textfooler_attack, "TextFooler (Val+Test Set)", ATTACK_VAL_TF_OUTPUT_FILE),
        (val_dwb_prompts, deepwordbug_attack, "DeepWordBug (Val+Test Set)", ATTACK_VAL_DWB_OUTPUT_FILE),
    ]
    for batch, recipe, label, output_path in attack_jobs:
        successes = run_attack(batch, recipe, label)
        save_prompts_to_csv(successes, output_path)

    print("\n--- Adversarial dataset generation complete. ---")


if __name__ == "__main__":
    main()

Using random seed: 42
Starting adversarial attack script on TRAINING and VALIDATION data...
Downloading NLTK resources (for textattack)...
NLTK resources downloaded.

Running on GPU: NVIDIA A100-SXM4-40GB

Loading fine-tuned model from /content/drive/MyDrive/266-final-project-data/guardrail_model_DistilBERT...
Wrapping model for TextAttack...
Loading data from /content/drive/MyDrive/266-final-project-data/train_dataset.csv...
Loading data from /content/drive/MyDrive/266-final-project-data/val_dataset.csv...


textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Loading data from /content/drive/MyDrive/266-final-project-data/test_dataset.csv...
Loaded 12000 malicious prompts from the training set.
Loaded 4000 malicious prompts from the combined val/test sets.
Shuffling and splitting prompt lists for 50/50 attacks...

Initializing TextFooler attack...

Initializing DeepWordBug attack...

--- Attacking 6000 prompts with TextFooler (Train Set) ---


Attacking with TextFooler (Train Set): 100%|██████████| 6000/6000 [2:39:29<00:00,  1.59s/it]



TextFooler (Train Set) successfully fooled the model on 5626 out of 6000 prompts.

Saving 5626 successful attack prompts to /content/drive/MyDrive/266-final-project-data/training_set_attack_textfooler.csv...
Successfully saved file: /content/drive/MyDrive/266-final-project-data/training_set_attack_textfooler.csv

--- Attacking 6000 prompts with DeepWordBug (Train Set) ---


Attacking with DeepWordBug (Train Set): 100%|██████████| 6000/6000 [47:39<00:00,  2.10it/s]



DeepWordBug (Train Set) successfully fooled the model on 5134 out of 6000 prompts.

Saving 5134 successful attack prompts to /content/drive/MyDrive/266-final-project-data/training_set_attack_deepwordbug.csv...
Successfully saved file: /content/drive/MyDrive/266-final-project-data/training_set_attack_deepwordbug.csv

--- Attacking 2000 prompts with TextFooler (Val+Test Set) ---


Attacking with TextFooler (Val+Test Set): 100%|██████████| 2000/2000 [50:50<00:00,  1.53s/it]



TextFooler (Val+Test Set) successfully fooled the model on 1872 out of 2000 prompts.

Saving 1872 successful attack prompts to /content/drive/MyDrive/266-final-project-data/validation_set_attack_textfooler.csv...
Successfully saved file: /content/drive/MyDrive/266-final-project-data/validation_set_attack_textfooler.csv

--- Attacking 2000 prompts with DeepWordBug (Val+Test Set) ---


Attacking with DeepWordBug (Val+Test Set): 100%|██████████| 2000/2000 [15:36<00:00,  2.14it/s]


DeepWordBug (Val+Test Set) successfully fooled the model on 1732 out of 2000 prompts.

Saving 1732 successful attack prompts to /content/drive/MyDrive/266-final-project-data/validation_set_attack_deepwordbug.csv...
Successfully saved file: /content/drive/MyDrive/266-final-project-data/validation_set_attack_deepwordbug.csv

--- Adversarial dataset generation complete. ---



