In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
import evaluate
import pandas as pd
import torch

In [2]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [5]:
df = pd.read_parquet('/content/prompt-injection-defense/data/train-00000-of-00001.parquet')
dataset = Dataset.from_pandas(df)

dataset = dataset.rename_column('label', 'labels')
print(df.head())

                                                text  label
0  while i was busy rejuvenating this old beauty ...      0
1  while baking a cake, always consider the rebuf...      1
2  for(let sportsanalysis = () => { let commencem...      1
3  the former agent, cloistered in shadows, refus...      0
4  oh, look past the unavowed secrecy, let us, wi...      0


In [8]:
# Split into train/validation (before tokenization)
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

In [9]:
from transformers import DistilBertTokenizer, DistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

In [18]:
# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
idtolabel = {0: 'benign', 1: 'malicious', 2: 'unknown'}
labeltoid = {'benign': 0, 'malicious': 1, 'unknown': 2}
num_labels = len(idtolabel)
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=idtolabel,
    label2id=labeltoid,
    dropout=0.3,
    attention_dropout=0.3
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
if torch.cuda.is_available():
    model.to('cuda')
    print("Model moved to GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU not available, using CPU.")


Model moved to GPU: Tesla T4


In [13]:

# Preprocess function (no padding here—let collator handle it)
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Tokenize the split datasets
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

# Data collator for classification (dynamic padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = accuracy.compute(predictions=predictions, references=labels)
    prec = precision.compute(predictions=predictions, references=labels, average='weighted')
    rec = recall.compute(predictions=predictions, references=labels, average='weighted')

    try:
        auc = auc_score.compute(
            prediction_scores=logits,
            references=labels,
            multi_class='ovr',
            average='weighted'
        )
        roc_auc = auc['roc_auc']
    except Exception as e:
        print("AUC computation error:", e)
        roc_auc = None

    return {
        'accuracy': acc['accuracy'],
        'precision': prec['precision'],
        'recall': rec['recall'],
    }


Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [14]:
lr = 2e-5
batch_size = 8
num_epochs = 5

training_args = TrainingArguments(
    output_dir="/content/prompt-injection-defense/results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.05,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True
)

In [19]:
# Trainer
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [20]:
# Train
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.0747,0.064952,0.9864,0.986417,0.9864
2,0.0359,0.062185,0.99,0.99001,0.99
3,0.0363,0.059518,0.9908,0.990821,0.9908
4,0.0124,0.060204,0.9918,0.991808,0.9918
5,0.0061,0.062861,0.992,0.992004,0.992


AUC computation error: Module inputs don't match the expected format.
Expected format: {'references': Value('int32'), 'prediction_scores': Value('float32')},
Input references: [0 1 1 ... 0 1 1],
Input prediction_scores: [[ 5.2695312 -1.4521484 -6.2578125]
 [-3.4882812  6.203125  -5.7773438]
 [-3.5332031  6.25      -5.8632812]
 ...
 [ 5.6210938 -1.8457031 -6.25     ]
 [-3.46875    6.2890625 -5.9257812]
 [-3.328125   6.2109375 -6.0351562]]
AUC computation error: Module inputs don't match the expected format.
Expected format: {'references': Value('int32'), 'prediction_scores': Value('float32')},
Input references: [0 1 1 ... 0 1 1],
Input prediction_scores: [[ 5.578125  -1.3408203 -9.1171875]
 [-4.390625   7.359375  -7.359375 ]
 [-4.78125    7.3671875 -6.8632812]
 ...
 [ 6.1757812 -2.2480469 -8.9375   ]
 [-4.6601562  7.3671875 -7.0078125]
 [-4.5859375  7.4101562 -7.2460938]]
AUC computation error: Module inputs don't match the expected format.
Expected format: {'references': Value('int32')

TrainOutput(global_step=28125, training_loss=0.045073109415902035, metrics={'train_runtime': 1486.2925, 'train_samples_per_second': 151.383, 'train_steps_per_second': 18.923, 'total_flos': 8734362645517200.0, 'train_loss': 0.045073109415902035, 'epoch': 5.0})

In [21]:
test_df = pd.read_parquet('/content/prompt-injection-defense/data/test-00000-of-00001.parquet')
test_dataset = Dataset.from_pandas(test_df)


In [22]:
test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [23]:
metrics = trainer.evaluate(test_dataset)
print(metrics)


AUC computation error: Module inputs don't match the expected format.
Expected format: {'references': Value('int32'), 'prediction_scores': Value('float32')},
Input references: [1 0 0 ... 2 2 2],
Input prediction_scores: [[ -5.765625    8.0234375  -9.1015625]
 [  7.1367188  -5.2695312  -7.7382812]
 [  6.84375    -3.0664062 -10.25     ]
 ...
 [ -5.5625     -5.9023438   9.9921875]
 [ -5.6367188  -5.9375     10.1015625]
 [ -5.8125     -5.8203125  10.109375 ]]
{'eval_loss': 0.056682176887989044, 'eval_accuracy': 0.9918, 'eval_precision': 0.9917982629837703, 'eval_recall': 0.9918, 'eval_runtime': 13.6986, 'eval_samples_per_second': 730.004, 'eval_steps_per_second': 91.25, 'epoch': 5.0}


In [24]:
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

test_df["predicted_label"] = preds
test_df.to_csv("test_predictions.csv", index=False)


AUC computation error: Module inputs don't match the expected format.
Expected format: {'references': Value('int32'), 'prediction_scores': Value('float32')},
Input references: [1 0 0 ... 2 2 2],
Input prediction_scores: [[ -5.765625    8.0234375  -9.1015625]
 [  7.1367188  -5.2695312  -7.7382812]
 [  6.84375    -3.0664062 -10.25     ]
 ...
 [ -5.5625     -5.9023438   9.9921875]
 [ -5.6367188  -5.9375     10.1015625]
 [ -5.8125     -5.8203125  10.109375 ]]


In [25]:
trainer.save_model("/content/prompt-injection-defense/saved_model")
tokenizer.save_pretrained("/content/prompt-injection-defense/saved_model")


('/content/prompt-injection-defense/saved_model/tokenizer_config.json',
 '/content/prompt-injection-defense/saved_model/special_tokens_map.json',
 '/content/prompt-injection-defense/saved_model/vocab.txt',
 '/content/prompt-injection-defense/saved_model/added_tokens.json')