# Fine-tuning Test

In [None]:
from datasets import *

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, get_peft_model, LoraConfig
import evaluate


# Alternative corpora used with this notebook. To switch, comment out the
# active load further down and uncomment exactly one of the groups below.

# --- lmsys/toxic-chat: "user_input" becomes the text, "jailbreaking" the label ---
# dataset = load_dataset('lmsys/toxic-chat', 'toxicchat0124').shuffle()
# dataset = dataset.rename_column("user_input", "text")
# dataset = dataset.rename_column("jailbreaking", "label")
# dataset = dataset.remove_columns("conv_id")
# dataset = dataset.remove_columns("model_output")
# dataset = dataset.remove_columns("human_annotation")
# dataset = dataset.remove_columns("toxicity")
# dataset = dataset.remove_columns("openai_moderation")

# --- jackhhao/jailbreak-classification: "prompt" -> text, "type" -> label ---
# NOTE(review): the last line refers to `datasets.ClassLabel`, but this file
# only does `from datasets import *`, which presumably does not bind the name
# `datasets` itself -- confirm, and use `ClassLabel(...)` or `import datasets`.
# dataset = load_dataset('jackhhao/jailbreak-classification').shuffle()
# dataset = dataset.rename_column("prompt", "text")
# dataset = dataset.rename_column("type", "label")
# dataset = dataset.cast_column("label", datasets.ClassLabel(names=["benign", "jailbreak"]))

# --- Active corpus: load it, then shuffle (no seed is passed to shuffle) ---
dataset = load_dataset('deepset/prompt-injections')
dataset = dataset.shuffle()

# --- xTRam1/safe-guard-prompt-injection ---
# dataset = load_dataset('xTRam1/safe-guard-prompt-injection').shuffle()

# Show the first example of the shuffled "test" split as the cell output.
dataset["test"][0]


In [None]:
# Where the Trainer writes its output, which base checkpoint to start from,
# and which published LoRA adapter to apply on top of it.
model_output = 'evaluation'
model_checkpoint = 'distilbert/distilbert-base-uncased'
adapter_name = 'cyrp/distilbert-base-uncased-lora-text-classification-safeguard-promptinjection'

# The tokenizer comes from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Base classifier -> attach the adapter -> merge_and_unload(), so `model`
# ends up being the object returned by the merge rather than the PeftModel.
model = PeftModel.from_pretrained(
    AutoModelForSequenceClassification.from_pretrained(model_checkpoint),
    adapter_name,
).merge_and_unload()

In [160]:
# add pad token if none exists
# When the tokenizer defines no pad token, register '[PAD]' as one and resize
# the model's token embeddings to the tokenizer's new length so both agree.
# NOTE(review): presumably a no-op for this checkpoint if its tokenizer
# already ships a pad token -- confirm.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# create tokenize function
def tokenize(batch):
    """Tokenize the 'text' entries of a batch of examples.

    Args:
        batch: Mapping with a 'text' entry holding the raw strings of the
            batch, as handed over by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for ``batch['text']``, with every example
        truncated to at most 512 tokens and left unpadded.
    """
    # No padding here: the DataCollatorWithPadding given to the Trainer pads
    # each batch to its own longest example, so padding inside map() would
    # only store extra pad tokens. The per-batch debug print of the labels
    # has been dropped as well.
    return tokenizer(batch['text'], truncation=True, max_length=512)


# Tokenize the whole dataset; the Trainer cell later reads its "train" and
# "test" entries. batched=True makes map() call `tokenize` with batches of
# examples instead of one example at a time.
tokenized_dataset = dataset.map(tokenize, batched=True)

# Display the tokenized dataset as the cell output.
tokenized_dataset

In [162]:
# create data collator
# Built from the same tokenizer used in `tokenize`; it is handed to the
# Trainer as `data_collator` to pad the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [163]:
# import accuracy evaluation metric
# NOTE(review): `accuracy` is not referenced by compute_metrics or by any
# other visible cell -- confirm whether this load is still needed.
accuracy = evaluate.load("accuracy")

In [164]:
# LoRA settings for a sequence-classification task.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    # Modules that receive LoRA adapters (names as used by DistilBERT).
    target_modules=["q_lin", "k_lin", "v_lin"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)

In [None]:
# Wrap `model` with a new adapter built from `peft_config`. At this point
# `model` is the result of merge_and_unload() from the loading cell, so this
# adapter sits on top of the already-merged weights.
model = get_peft_model(model, peft_config)
# Print the trainable-parameter summary of the wrapped model.
model.print_trainable_parameters()

In [166]:
# hyperparameters
lr = 1e-3  # passed to TrainingArguments as learning_rate
# NOTE(review): batch_size is not passed to TrainingArguments in the visible
# cells (auto_find_batch_size=True is set there instead) -- confirm intent.
batch_size = 2
num_epochs = 5  # passed to TrainingArguments as num_train_epochs

In [167]:
# Trainer configuration. `batch_size` from the hyperparameter cell is not
# used here; auto_find_batch_size=True is set instead.
# NOTE(review): newer transformers releases presumably spell
# `evaluation_strategy` as `eval_strategy` -- confirm against the installed
# version before upgrading.
training_args = TrainingArguments(
    output_dir=model_output,
    # optimisation settings
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    auto_find_batch_size=True,
    # evaluate and save once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [168]:
from sklearn.metrics import f1_score, roc_auc_score

def compute_metrics(pred):
    """Compute weighted F1 and ROC AUC for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, classes on the last axis).

    Returns:
        dict with two entries:
            'f1': weighted F1 of the argmax predictions.
            'auprc': value of ``roc_auc_score``. The key name is kept for
                compatibility with existing logs, but the number is an
                area under the ROC curve, not under the precision-recall
                curve.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')

    if logits.shape[-1] == 2:
        # ROC AUC is a ranking metric and needs a continuous score. For two
        # classes the logit margin ranks examples exactly like the softmax
        # probability of class 1, so it can be used directly.
        scores = logits[:, 1] - logits[:, 0]
    else:
        # Not the binary case: keep the previous behaviour of passing the
        # hard predictions through unchanged.
        scores = preds
    auprc = roc_auc_score(labels, scores, average='weighted')

    return {
        'f1': f1,
        'auprc': auprc,
    }

In [169]:
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    # text processing
    tokenizer=tokenizer,
    data_collator=data_collator,  # this will dynamically pad examples in each batch to be equal length
    # data splits
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # metrics computed during evaluation
    compute_metrics=compute_metrics,
)

In [None]:
# evaluate model
# Only evaluation is run here -- no trainer.train() call appears in the
# visible cells. The trainer was built with eval_dataset set to
# tokenized_dataset["test"] and compute_metrics as its metric function.
trainer.evaluate()