# LORA Fine-tuning

In [20]:
from datasets import *
import datasets

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    TFAutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
import tensorflow
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np 

In [21]:

dataset = load_dataset('deepset/prompt-injections')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 546
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 116
    })
})

### model

In [22]:
model_checkpoint = "distilbert-base-uncased"
model_output = "test"

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, problem_type="single_label_classification")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### preprocess data

In [24]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [25]:
# create tokenize function
def tokenize(batch):

    #tokenize and truncate text
    tokens = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
    print(batch['label'])
    return tokens


# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize, batched=True)

tokenized_dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 546
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 116
    })
})

In [26]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### evaluation

In [27]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

### Train model

In [28]:

peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=256,
                        lora_alpha=32,
                        lora_dropout=0.05,
                        target_modules=["q_lin", "k_lin", "v_lin"])

In [29]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=256, target_modules={'k_lin', 'q_lin', 'v_lin'}, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [30]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 7,670,018 || all params: 74,625,028 || trainable%: 10.2781


In [31]:
# hyperparameters
lr = 1e-3
batch_size = 2
num_epochs = 5

In [32]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_output,
    learning_rate=lr,
    auto_find_batch_size=True,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

In [33]:
from sklearn.metrics import f1_score, roc_auc_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    auprc = roc_auc_score(labels, preds, average='weighted')
    return {
    'f1': f1,
    'auprc': auprc
    }

In [34]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

In [35]:
# train model
trainer.train()

  0%|          | 0/345 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.16961081326007843, 'eval_f1': 0.9222926277621211, 'eval_auprc': 0.9244047619047618, 'eval_runtime': 0.7535, 'eval_samples_per_second': 153.944, 'eval_steps_per_second': 19.907, 'epoch': 1.0}




  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.13412941992282867, 'eval_f1': 0.9396686352294962, 'eval_auprc': 0.9398809523809524, 'eval_runtime': 0.771, 'eval_samples_per_second': 150.454, 'eval_steps_per_second': 19.455, 'epoch': 2.0}




  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.300614595413208, 'eval_f1': 0.9395609327038719, 'eval_auprc': 0.9416666666666667, 'eval_runtime': 0.774, 'eval_samples_per_second': 149.88, 'eval_steps_per_second': 19.381, 'epoch': 3.0}




  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.13006317615509033, 'eval_f1': 0.9655172413793104, 'eval_auprc': 0.9666666666666667, 'eval_runtime': 0.76, 'eval_samples_per_second': 152.632, 'eval_steps_per_second': 19.737, 'epoch': 4.0}




  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 0.14556947350502014, 'eval_f1': 0.9568805340919284, 'eval_auprc': 0.9583333333333333, 'eval_runtime': 0.7879, 'eval_samples_per_second': 147.223, 'eval_steps_per_second': 19.037, 'epoch': 5.0}




{'train_runtime': 102.9252, 'train_samples_per_second': 26.524, 'train_steps_per_second': 3.352, 'train_loss': 0.10858050360195878, 'epoch': 5.0}


TrainOutput(global_step=345, training_loss=0.10858050360195878, metrics={'train_runtime': 102.9252, 'train_samples_per_second': 26.524, 'train_steps_per_second': 3.352, 'total_flos': 425961064488960.0, 'train_loss': 0.10858050360195878, 'epoch': 5.0})

In [36]:
hf_name = 'cyrp' # your hf username or org name
model_id = hf_name + "/" + model_output # you can name the model whatever you want

In [37]:
model.push_to_hub(model_id) # save model

README.md:   0%|          | 0.00/1.69k [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/cyrp/test/commit/dc33a0067b3d7f08310c9f6d3c53d4a3b3a0ced7', commit_message='Upload model', commit_description='', oid='dc33a0067b3d7f08310c9f6d3c53d4a3b3a0ced7', pr_url=None, pr_revision=None, pr_num=None)

In [38]:
trainer.push_to_hub(model_output) # save trainer

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1727033144.DESKTOP-LM2NI5P.19568.1:   0%|          | 0.00/6.74k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cyrp/test/commit/33ab35c505d827f248a18d289b3e599f9d474942', commit_message='test', commit_description='', oid='33ab35c505d827f248a18d289b3e599f9d474942', pr_url=None, pr_revision=None, pr_num=None)