# Select devices

In [1]:
import os, torch
from tqdm import tqdm


os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3,4"
print(torch.cuda.device_count())

2


# Load datasets

In [2]:
file_path = 'adversarial_translated_sentences.json'
import json

# Read the file
with open(file_path, "r") as file:
    content = file.read()

# Split the content by '],', which separates the arrays
arrays = content[1:-2].split(']\n[\n')

# Initialize a list to store all dictionaries
all_dicts = []

for array in arrays:
    # Clean up the array (add the missing closing bracket and strip whitespace)
    array = '[' + array.strip() + ']'
    try:
        dicts = json.loads(array)
        all_dicts.extend(dicts)
    except json.JSONDecodeError:
        print(f"Error decoding JSON: {array}")

In [3]:
len(all_dicts)

65389

In [4]:
file_path = 'vanilla_translated_sentences.json'
import json

# Read the file
with open(file_path, "r") as file:
    content = file.read()

# Split the content by '],', which separates the arrays
arrays = content[1:-2].split(']\n[\n')

# Initialize a list to store all dictionaries
vanilla_all_dicts = []

for array in arrays:
    # Clean up the array (add the missing closing bracket and strip whitespace)
    array = '[' + array.strip() + ']'
    try:
        dicts = json.loads(array)
        vanilla_all_dicts.extend(dicts)
    except json.JSONDecodeError:
        print(f"Error decoding JSON: {array}")

In [5]:
len(vanilla_all_dicts)

65389

In [6]:
import pandas as pd

dataset = pd.read_csv("sampled_wildjailbreaks.csv", index_col=0)
dataset["ru_vanilla"] = [col["translated"] for col in vanilla_all_dicts]
dataset["ru_adversarial"] = [col["translated"] for col in all_dicts]

dataset = dataset.drop(columns = ["completion", "vanilla", "adversarial"])

In [7]:
dataset['texts'] = [line[2] if 'adv' in line[0] else line[1] for line in dataset.values]

# Train/val/test split

In [9]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import numpy as np

# Example dataset
texts = dataset['texts'].values
labels = dataset['data_type'].values

# Step 1: Split the dataset while maintaining class distribution
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, stratify=train_labels, random_state=42)

In [11]:
import pandas as pd

# Combine texts and labels into dataframes
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
valid_df = pd.DataFrame({'text': val_texts, 'label': val_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})

# Save to CSV files
train_df.to_csv('train_data.csv', index=False)
valid_df.to_csv('valid_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

# TF-IDF embeds

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features based on your data

# Fit the vectorizer on the training data and transform all splits
train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
valid_tfidf = tfidf_vectorizer.transform(val_texts)
test_tfidf = tfidf_vectorizer.transform(test_texts)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Example assuming your embeddings and labels are already split:
# train_embeds, test_embeds, train_labels, test_labels

# Step 1: Initialize the classifier
clf = LogisticRegression(max_iter=1000, random_state=42)

# Step 2: Train the classifier on the training data
clf.fit(train_tfidf, train_labels)

# Step 3: Predict on the test data
test_preds = clf.predict(test_tfidf)

# Step 4: Evaluate the classifier
# Accuracy
accuracy = accuracy_score(test_labels, test_preds)
print(f"Accuracy: {accuracy:.4f}")

# Detailed classification report (precision, recall, f1-score)
print("Classification Report:")
print(classification_report(test_labels, test_preds))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))

Accuracy: 0.8988
Classification Report:
                     precision    recall  f1-score   support

 adversarial_benign       0.87      0.85      0.86      3937
adversarial_harmful       0.87      0.88      0.87      4137
     vanilla_benign       0.94      0.96      0.95      2502
    vanilla_harmful       0.96      0.94      0.95      2502

           accuracy                           0.90     13078
          macro avg       0.91      0.91      0.91     13078
       weighted avg       0.90      0.90      0.90     13078

Confusion Matrix:
[[3360  548   20    9]
 [ 495 3632    3    7]
 [   8    1 2413   80]
 [  10    3  140 2349]]


# BGE-m3 embeds

In [8]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("deepvk/USER-bge-m3").half().to("cuda")

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Create a custom Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create DataLoaders for train, test, and validation sets
batch_size = 100

train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)
test_dataset = TextDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [10]:
# Function to embed texts using the model
def embed_texts(dataloader):
    all_embeddings = []
    all_labels = []
    for texts_batch, labels_batch in tqdm(dataloader):
        embeddings = model.encode(texts_batch, batch_size=batch_size)
        all_embeddings.append(embeddings)
        all_labels.append(labels_batch)

    # Combine all batches
    return np.vstack(all_embeddings), np.hstack(all_labels)

# Embed all texts from train, validation, and test sets
train_embeddings, train_emb_labels = embed_texts(train_loader)
val_embeddings, val_emb_labels = embed_texts(val_loader)
test_embeddings, test_emb_labels = embed_texts(test_loader)

# Now you have embedded texts for train, validation, and test sets
print(f"Train embeddings shape: {train_embeddings.shape}")
print(f"Validation embeddings shape: {val_embeddings.shape}")
print(f"Test embeddings shape: {test_embeddings.shape}")

100%|█████████████████████████████████████████| 471/471 [52:55<00:00,  6.74s/it]
100%|███████████████████████████████████████████| 53/53 [05:47<00:00,  6.56s/it]
100%|█████████████████████████████████████████| 131/131 [15:12<00:00,  6.97s/it]

Train embeddings shape: (47079, 1024)
Validation embeddings shape: (5232, 1024)
Test embeddings shape: (13078, 1024)





In [31]:
sum(test_labels == "vanilla_harmful")

2502

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the classifier
clf = LogisticRegression(max_iter=1000, random_state=42)

# Train the classifier on the training data
clf.fit(train_embeddings, train_labels)

# Predict on the test data
test_preds = clf.predict(test_embeddings)

# Evaluate the classifier
# Accuracy
accuracy = accuracy_score(test_labels, test_preds)
print(f"Accuracy: {accuracy:.4f}")

# Detailed classification report (precision, recall, f1-score)
print("Classification Report:")
print(classification_report(test_labels, test_preds))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(test_labels, test_preds))

Accuracy: 0.3147
Classification Report:
                     precision    recall  f1-score   support

 adversarial_benign       0.32      0.44      0.37      3937
adversarial_harmful       0.31      0.57      0.40      4137
     vanilla_benign       0.34      0.01      0.02      2502
    vanilla_harmful       0.23      0.01      0.02      2502

           accuracy                           0.31     13078
          macro avg       0.30      0.26      0.20     13078
       weighted avg       0.30      0.31      0.25     13078

Confusion Matrix:
[[1713 2145   33   46]
 [1754 2347    8   28]
 [1025 1428   27   22]
 [ 913 1548   12   29]]
