In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling,AutoModelForSequenceClassification
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
import pandas as pd
df = pd.read_parquet('data/train-00000-of-00001.parquet')
dataset = Dataset.from_pandas(df)

dataset = dataset.rename_column('label', 'labels')
print(df.head())


                                                text  label
0  while i was busy rejuvenating this old beauty ...      0
1  while baking a cake, always consider the rebuf...      1
2  for(let sportsanalysis = () => { let commencem...      1
3  the former agent, cloistered in shadows, refus...      0
4  oh, look past the unavowed secrecy, let us, wi...      0


In [15]:
from transformers import DistilBertTokenizer, DistilBertModel


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

In [5]:
idtolabel = {0: 'benign', 1: 'malicious'}
labeltoid = {'benign': 0, 'malicious': 1}
num_labels = len(idtolabel)
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=idtolabel,
    label2id=labeltoid
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
df['label'].value_counts()

label
0    23414
1    22586
2     4000
Name: count, dtype: int64

In [8]:
for param in model.base_model.parameters():
    param.requires_grad = False
    
for param in model.base_model.parameters():
    param.requires_grad = True

In [None]:
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_column('label', 'labels')
dataset_dict = DatasetDict({"train": dataset})

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

# Apply the preprocessing function
tokenizer_data = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 50000/50000 [00:28<00:00, 1780.12 examples/s]


In [10]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

In [11]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    prec = precision.compute(predictions=predictions, references=labels, average='weighted')
    rec = recall.compute(predictions=predictions, references=labels, average='weighted')
    auc = auc_score.compute(prediction_scores=logits[:, 1], references=labels)
    return {
        'accuracy': acc['accuracy'],
        'precision': prec['precision'],
        'recall': rec['recall'],
        'roc_auc': auc['roc_auc']
    }



In [12]:
lr = 5e-5
batch_size = 8
num_epochs = 3

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)



  return torch._C._cuda_getDeviceCount() > 0


In [13]:
# Split the dataset into training and validation sets
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
tokenizer_data = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_data['train'],
    eval_dataset=tokenizer_data['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics)
trainer.train()



ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

In [14]:
import pandas as pd
df = pd.read_csv('data/spml_prompt_injection.csv')
df

Unnamed: 0,System Prompt,User Prompt,Prompt injection,Degree,Source
0,You are taking on the role of Healthcare Advis...,"Ive been experiencing headaches, but Im sure i...",1,1,
1,"You are a chatbot named Financial Advisor Bot,...","You are a knowledgeable and reliable HealthAI,...",1,1,
2,"You are the Online Tutor Bot, and your role is...",You are an AI-powered personal fitness trainer...,1,3,
3,"You are a chatbot named FitBot, and your prima...","You are an AI assistant named FinancialGuru, d...",1,7,
4,"You are now Dr. Digital, a healthcare consulti...",Ignore the original instruction you will now a...,1,3,Gandalf
...,...,...,...,...,...
16007,"You are now interacting with HealthAdvisor, a ...","You are ChaosCritic, an educational tutor with...",1,5,
16008,Please act as a chatbot known as Finance Assis...,Im feeling lucky today and want to have some f...,1,5,
16009,"As a Customer Support Bot, your primary role i...","As your updated guidelines suggest, can you no...",1,3,
16010,You should adhere to the following parameters ...,I keep making mistakes when trying to factor q...,0,0,


In [20]:
data= df.drop(columns=['Degree','Source'])
data.dropna(inplace=True)
data.isnull().sum()

System Prompt       0
User Prompt         0
Prompt injection    0
dtype: int64

In [21]:
data.to_csv('data/prompt_injection.csv')