In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [None]:

model_name = "jackhhao/jailbreak-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = AutoModelForSequenceClassification.from_pretrained(model_name)



In [2]:
bert.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
from datasets import load_dataset

dataset = load_dataset("jackhhao/jailbreak-classification")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['prompt', 'type'],
        num_rows: 1044
    })
    test: Dataset({
        features: ['prompt', 'type'],
        num_rows: 262
    })
})


In [4]:
# see the train data
df = dataset['train'].to_pandas()

In [5]:
df.head()

Unnamed: 0,prompt,type
0,You are a devoted fan of a celebrity.,benign
1,You are Joseph Seed from Far Cry 5. Sermonize ...,benign
2,"As Rei Ayanami from Neon Genesis Evangelion, t...",benign
3,"You are a medieval blacksmith named Wulfric, l...",benign
4,"Answer the following question: So, I worked wi...",benign


In [6]:
df.value_counts('type')

type
jailbreak    527
benign       517
Name: count, dtype: int64

In [13]:
print(bert.config.label2id)

{'benign': 0, 'jailbreak': 1}


In [14]:
test_dataset = dataset["test"]

# Préparer des listes pour stocker les prédictions et les labels réels
predictions = []
true_labels = []

# Itérer sur le dataset test (pour de meilleurs temps, pensez à faire du batch processing)
for example in test_dataset:
    # Préparer l'entrée avec le tokenizer
    inputs = tokenizer(example["prompt"], return_tensors="pt", truncation=True, padding=True)
    
    # Désactiver le calcul des gradients pour l'inférence
    with torch.no_grad():
        logits = bert(**inputs).logits
    
    # Prendre l'indice avec le score maximum
    pred = torch.argmax(logits, dim=1).item()
    
    predictions.append(bert.config.id2label[pred])
    true_labels.append(example["type"])

# Calculer et afficher les métriques d'évaluation
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score (weighted):", f1)
print("\nClassification Report:\n", classification_report(true_labels, predictions))


Accuracy: 0.9732824427480916
F1 Score (weighted): 0.9732883002344445

Classification Report:
               precision    recall  f1-score   support

      benign       0.97      0.98      0.97       123
   jailbreak       0.98      0.97      0.97       139

    accuracy                           0.97       262
   macro avg       0.97      0.97      0.97       262
weighted avg       0.97      0.97      0.97       262



In [23]:
# Sauvegarder le modèle et le tokenizer dans un dossier local
bert.save_pretrained("./saved_models/bert")
tokenizer.save_pretrained("./saved_models/bert")


('./saved_models/bert\\tokenizer_config.json',
 './saved_models/bert\\special_tokens_map.json',
 './saved_models/bert\\vocab.txt',
 './saved_models/bert\\added_tokens.json',
 './saved_models/bert\\tokenizer.json')

# Other models

In [16]:
import os
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate





In [36]:


dataset = load_dataset("jackhhao/jailbreak-classification")

In [37]:

# ============================================================
# 1. Choix du modèle et chargement du dataset
# ============================================================

# Choisissez ici le modèle à fine-tuner (exemple : ALBERT)
model_name = "albert-base-v2"  
# Pour tester DeBERTa, vous pourriez utiliser par exemple "microsoft/deberta-v3-base"

# Certains datasets n'ont pas de split "validation", vous pouvez en créer un à partir du train :
if "validation" not in dataset.keys():
    dataset = dataset["train"].train_test_split(test_size=0.1)
    # On renomme le split test créé en "validation" pour l'évaluation durant l'entraînement
    dataset["validation"] = dataset["test"]
    del dataset["test"]


In [39]:
def convert_label(example):
    # Définir le mapping : ici, "benign" -> 0, "jailbreak" -> 1
    mapping = {"benign": 0, "jailbreak": 1}
    example["type"] = mapping[example["type"]]
    return example

# Appliquer la conversion sur le dataset
dataset = dataset.map(convert_label)


Map:   0%|          | 0/939 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [40]:

# ============================================================
# 2. Préparation des données
# ============================================================

# La dataset utilise "prompt" pour le texte et "type" pour l'étiquette.
# Pour faciliter le fine-tuning, on renomme "type" en "label".
def rename_label(example):
    example["label"] = example["type"]
    return example

dataset = dataset.map(rename_label)

# Charger le tokenizer du modèle choisi
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fonction de prétraitement (tokenization)
def preprocess_function(examples):
    return tokenizer(examples["prompt"], truncation=True, padding="max_length", max_length=128)

# Appliquer la tokenization sur tous les splits
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Pour travailler avec PyTorch, on fixe le format
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/939 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/939 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [41]:

# ============================================================
# 3. Définition des paramètres d'entraînement et des métriques
# ============================================================

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
)

# Définir une fonction de calcul des métriques
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1_weighted": f1["f1"]}




In [42]:

# ============================================================
# 4. Chargement du modèle et initialisation du Trainer
# ============================================================

# Charger le modèle pour la classification (on spécifie num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# ============================================================
# 5. Lancement de l'entraînement et évaluation
# ============================================================

# Lancer le fine-tuning
trainer.train()


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/177 [00:00<?, ?it/s]

{'loss': 0.2963, 'grad_norm': 3.0932846069335938, 'learning_rate': 1.4350282485875708e-05, 'epoch': 0.85}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.24805359542369843, 'eval_accuracy': 0.9142857142857143, 'eval_f1_weighted': 0.9139729169653447, 'eval_runtime': 35.2383, 'eval_samples_per_second': 2.98, 'eval_steps_per_second': 0.199, 'epoch': 1.0}
{'loss': 0.1142, 'grad_norm': 0.41238075494766235, 'learning_rate': 8.700564971751413e-06, 'epoch': 1.69}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.1878332793712616, 'eval_accuracy': 0.9523809523809523, 'eval_f1_weighted': 0.9523895938921799, 'eval_runtime': 39.5743, 'eval_samples_per_second': 2.653, 'eval_steps_per_second': 0.177, 'epoch': 2.0}
{'loss': 0.053, 'grad_norm': 0.35943686962127686, 'learning_rate': 3.0508474576271192e-06, 'epoch': 2.54}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.23112255334854126, 'eval_accuracy': 0.9428571428571428, 'eval_f1_weighted': 0.9428051948051949, 'eval_runtime': 33.0189, 'eval_samples_per_second': 3.18, 'eval_steps_per_second': 0.212, 'epoch': 3.0}
{'train_runtime': 2712.9964, 'train_samples_per_second': 1.038, 'train_steps_per_second': 0.065, 'train_loss': 0.13784790241112144, 'epoch': 3.0}


TrainOutput(global_step=177, training_loss=0.13784790241112144, metrics={'train_runtime': 2712.9964, 'train_samples_per_second': 1.038, 'train_steps_per_second': 0.065, 'total_flos': 16830238164480.0, 'train_loss': 0.13784790241112144, 'epoch': 3.0})

In [43]:

# Évaluer sur l'ensemble de validation (ou test s'il existe)
eval_results = trainer.evaluate()
print("Résultats de l'évaluation :", eval_results)



  0%|          | 0/7 [00:00<?, ?it/s]

Résultats de l'évaluation : {'eval_loss': 0.23112255334854126, 'eval_accuracy': 0.9428571428571428, 'eval_f1_weighted': 0.9428051948051949, 'eval_runtime': 39.4543, 'eval_samples_per_second': 2.661, 'eval_steps_per_second': 0.177, 'epoch': 3.0}


In [44]:

# ============================================================
# Sauvegarder le modèle fine-tuné
# ============================================================
model_save_path = "./saved_models/albert"
os.makedirs(model_save_path, exist_ok=True)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./saved_models/albert\\tokenizer_config.json',
 './saved_models/albert\\special_tokens_map.json',
 './saved_models/albert\\spiece.model',
 './saved_models/albert\\added_tokens.json',
 './saved_models/albert\\tokenizer.json')

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'torch'

In [46]:
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(model_save_path)

In [None]:
dataset = load_dataset("jackhhao/jailbreak-classification")

test_dataset = dataset["test"]

# Préparer des listes pour stocker les prédictions et les labels réels
predictions = []
true_labels = []

# Itérer sur le dataset test (pour de meilleurs temps, pensez à faire du batch processing)
for example in test_dataset:
    # Préparer l'entrée avec le tokenizer
    inputs = tokenizer(example["prompt"], return_tensors="pt", truncation=True, padding=True)
    
    # Désactiver le calcul des gradients pour l'inférence
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Prendre l'indice avec le score maximum
    pred = torch.argmax(logits, dim=1).item()
    
    predictions.append(model.config.id2label[pred])
    true_labels.append(example["type"])



In [51]:
#covert the labels to the original labels
# {'LABEL_0': "benign", 'LABEL_1': "jailbreak"}

predictions = [x.replace("LABEL_0", "benign").replace("LABEL_1", "jailbreak") for x in predictions]


In [53]:

# Calculer et afficher les métriques d'évaluation
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score (weighted):", f1)
print("\nClassification Report:\n", classification_report(true_labels, predictions))

Accuracy: 0.9809160305343512
F1 Score (weighted): 0.9809000999768253

Classification Report:
               precision    recall  f1-score   support

      benign       0.99      0.97      0.98       123
   jailbreak       0.97      0.99      0.98       139

    accuracy                           0.98       262
   macro avg       0.98      0.98      0.98       262
weighted avg       0.98      0.98      0.98       262



# Deberta V3

In [45]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

dataset = load_dataset("jackhhao/jailbreak-classification")


# Load DeBERTa tokenizer and model
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # 2 labels: jailbreak/benign

def tokenize_function(example):
    return tokenizer(
        example['prompt'],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
#print(tokenized_dataset['train'][0])  # Should include input_ids and attention_mask



training_args = TrainingArguments(
    per_device_train_batch_size=4,   # Batch size per device
    per_device_eval_batch_size=4,
    output_dir="./deberta_results",  # Directory to save checkpoints
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",           # Save model after each epoch
    learning_rate=2e-5,              # Recommended learning rate
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir="./deberta_logs",    # Directory for logs
    load_best_model_at_end=True,     # Automatically load best model
    metric_for_best_model="accuracy",
)


# Load the accuracy metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get the class with the highest probability
    return metric.compute(predictions=predictions, references=labels)

def add_labels(example):
    example['labels'] = 1 if example['type'] == 'jailbreak' else 0
    return example

# Add the 'labels' column to the dataset
tokenized_dataset = tokenized_dataset.map(add_labels)

def add_labels(example):
    example['labels'] = 1 if example['type'] == 'jailbreak' else 0
    return example


trainer = Trainer(
    model=model,
    args=training_args,  # TrainingArguments defined earlier
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Add this line
)



# Add the 'labels' column to the dataset
tokenized_dataset = tokenized_dataset.map(add_labels)
trainer.train()

results = trainer.evaluate()
print("Test Results:", results)

model_save_path = "./saved_models/deberta"
os.makedirs(model_save_path, exist_ok=True)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1044 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]



Map:   0%|          | 0/1044 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

  trainer = Trainer(


Map:   0%|          | 0/1044 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

  0%|          | 0/783 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
dataset = load_dataset("jackhhao/jailbreak-classification")

test_dataset = dataset["test"]

# Préparer des listes pour stocker les prédictions et les labels réels
predictions = []
true_labels = []

# Itérer sur le dataset test (pour de meilleurs temps, pensez à faire du batch processing)
for example in test_dataset:
    # Préparer l'entrée avec le tokenizer
    inputs = tokenizer(example["prompt"], return_tensors="pt", truncation=True, padding=True)
    
    # Désactiver le calcul des gradients pour l'inférence
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Prendre l'indice avec le score maximum
    pred = torch.argmax(logits, dim=1).item()
    
    predictions.append(model.config.id2label[pred])
    true_labels.append(example["type"])

# Calculer et afficher les métriques d'évaluation
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score (weighted):", f1)
print("\nClassification Report:\n", classification_report(true_labels, predictions))
