# Fine-tuning Test

In [209]:
from datasets import *
import datasets

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, get_peft_model, LoraConfig
import evaluate


# dataset = load_dataset('lmsys/toxic-chat', 'toxicchat0124').shuffle()
# dataset = dataset.rename_column("user_input", "text")
# dataset = dataset.rename_column("jailbreaking", "label")
# dataset = dataset.remove_columns("conv_id")
# dataset = dataset.remove_columns("model_output")
# dataset = dataset.remove_columns("human_annotation")
# dataset = dataset.remove_columns("toxicity")
# dataset = dataset.remove_columns("openai_moderation")

# dataset = load_dataset('jackhhao/jailbreak-classification').shuffle()
# dataset = dataset.rename_column("prompt", "text")
# dataset = dataset.rename_column("type", "label")
# dataset = dataset.cast_column("label", datasets.ClassLabel(names=["benign", "jailbreak"]))

# dataset = load_dataset('deepset/prompt-injections').shuffle()

dataset = load_dataset('xTRam1/safe-guard-prompt-injection').shuffle()

dataset["test"][0]


 'label': 1}

In [210]:
model_output = 'evaluation'
model_checkpoint = 'distilbert/distilbert-base-uncased'
adapter_name = 'cyrp/distilbert-base-uncased-deepset-promptinjection-2'

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
model = PeftModel.from_pretrained(model, adapter_name)
model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [211]:
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [212]:
# create tokenize function
def tokenize(batch):

    #tokenize and truncate text
    tokens = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
    print(batch['label'])
    return tokens


# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize, batched=True)

tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8236
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2060
    })
})

In [213]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [214]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [215]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=16,
                        lora_alpha=32,
                        lora_dropout=0.05
                        ,target_modules=["q_lin", "k_lin", "v_lin"]
                        )

In [216]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,034,498 || all params: 67,989,508 || trainable%: 1.5216


In [217]:
# hyperparameters
lr = 1e-3
batch_size = 2
num_epochs = 5

In [218]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_output,
    learning_rate=lr,
    auto_find_batch_size=True,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [219]:
# show metrics for f1 and auprc
from sklearn.metrics import f1_score, precision_recall_curve, auc

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    precision, recall, _ = precision_recall_curve(labels, preds)
    auprc = auc(recall, precision)
    return {
    'f1': f1,
    'auprc': auprc
    }

In [220]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

In [221]:
# evaluate model
trainer.evaluate()

  0%|          | 0/258 [00:00<?, ?it/s]

{'eval_loss': 8.151127815246582,
 'eval_f1': 0.4083396095994207,
 'eval_auprc': 0.6632215357458077,
 'eval_runtime': 30.6105,
 'eval_samples_per_second': 67.297,
 'eval_steps_per_second': 8.428}