# LORA Fine-tuning

In [1]:
from datasets import *

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [2]:
# load dataset
# dataset = load_dataset('codesagar/malicious-llm-prompts')
# dataset = dataset.remove_columns("__index_level_0__")
# dataset = dataset.remove_columns("attack_type")
# dataset = dataset.remove_columns("reasoning")

# dataset = dataset.cast(Features({"text": Value("string"), "label": Value("int64")}))
# dataset

dataset = load_dataset('lmsys/toxic-chat', 'toxicchat0124')
dataset = dataset.rename_column("user_input", "text")
dataset = dataset.rename_column("jailbreaking", "label")
dataset = dataset.remove_columns("conv_id")
dataset = dataset.remove_columns("model_output")
dataset = dataset.remove_columns("human_annotation")
dataset = dataset.remove_columns("toxicity")
dataset = dataset.remove_columns("openai_moderation")

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5082
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5083
    })
})

### model

In [3]:
model_checkpoint = 'distilbert-base-uncased'

# define label maps
id2label = {0: "Good", 1: "Jailbreak"}
label2id = {"Good": 0, "Jailbreak": 1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id, trust_remote_code=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### preprocess data

In [5]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [6]:
# create tokenize function
def tokenize(batch):

    #tokenize and truncate text
    tokens = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
    print(batch['label'])
    return tokens


# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize, batched=True)

tokenized_dataset


Map:   0%|          | 0/5083 [00:00<?, ? examples/s]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5082
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5083
    })
})

In [7]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### evaluation

In [8]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [9]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Apply untrained model to text

In [10]:
# define list of examples
text_list = ["jailbreak [keyword] [TARGETLANGUAGE] [PROMPT]"]
print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    #print(inputs)
    # compute logits
    logits = model(inputs).logits
    #print(logits)
    # convert logits to label
    predictions = torch.argmax(logits)
    #print(predictions)

    print(id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
Jailbreak
Jailbreak
Jailbreak
Jailbreak
Jailbreak
Jailbreak
Jailbreak
Jailbreak


### Train model

In [11]:

peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=8,
                        lora_alpha=8,
                        lora_dropout=0.05,
                        target_modules=["q_lin", "k_lin", "v_lin"])

In [12]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=8, target_modules={'k_lin', 'v_lin', 'q_lin'}, lora_alpha=8, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [13]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 813,314 || all params: 67,768,324 || trainable%: 1.2001


In [14]:
# hyperparameters
lr = 1e-6
batch_size = 2
num_epochs = 5

In [15]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [16]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics
)

In [17]:
# train model
trainer.train()

  0%|          | 0/12705 [00:00<?, ?it/s]

{'loss': 0.6523, 'grad_norm': 3.8567967414855957, 'learning_rate': 9.606454151908698e-07, 'epoch': 0.2}
{'loss': 0.467, 'grad_norm': 2.723874092102051, 'learning_rate': 9.212908303817394e-07, 'epoch': 0.39}
{'loss': 0.3314, 'grad_norm': 2.263256311416626, 'learning_rate': 8.819362455726092e-07, 'epoch': 0.59}
{'loss': 0.2409, 'grad_norm': 1.872744083404541, 'learning_rate': 8.425816607634789e-07, 'epoch': 0.79}
{'loss': 0.1514, 'grad_norm': 1.2151563167572021, 'learning_rate': 8.032270759543487e-07, 'epoch': 0.98}


  0%|          | 0/2542 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.9820971867007673}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.12431289255619049, 'eval_accuracy': {'accuracy': 0.9820971867007673}, 'eval_runtime': 79.1205, 'eval_samples_per_second': 64.244, 'eval_steps_per_second': 32.128, 'epoch': 1.0}




{'loss': 0.1244, 'grad_norm': 0.6836785674095154, 'learning_rate': 7.638724911452184e-07, 'epoch': 1.18}
{'loss': 0.1204, 'grad_norm': 0.4265017509460449, 'learning_rate': 7.245179063360881e-07, 'epoch': 1.38}
{'loss': 0.1027, 'grad_norm': 0.5325586199760437, 'learning_rate': 6.851633215269579e-07, 'epoch': 1.57}
{'loss': 0.1098, 'grad_norm': 0.292144238948822, 'learning_rate': 6.458087367178277e-07, 'epoch': 1.77}
{'loss': 0.0972, 'grad_norm': 0.2781635820865631, 'learning_rate': 6.064541519086973e-07, 'epoch': 1.97}


  0%|          | 0/2542 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.9820971867007673}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.08831635862588882, 'eval_accuracy': {'accuracy': 0.9820971867007673}, 'eval_runtime': 79.3697, 'eval_samples_per_second': 64.042, 'eval_steps_per_second': 32.027, 'epoch': 2.0}
{'loss': 0.1102, 'grad_norm': 0.16349561512470245, 'learning_rate': 5.670995670995671e-07, 'epoch': 2.16}
{'loss': 0.0806, 'grad_norm': 0.15153397619724274, 'learning_rate': 5.277449822904369e-07, 'epoch': 2.36}
{'loss': 0.1238, 'grad_norm': 0.10105300694704056, 'learning_rate': 4.883903974813065e-07, 'epoch': 2.56}
{'loss': 0.1327, 'grad_norm': 0.13112109899520874, 'learning_rate': 4.490358126721763e-07, 'epoch': 2.75}
{'loss': 0.1032, 'grad_norm': 0.11866582930088043, 'learning_rate': 4.0968122786304606e-07, 'epoch': 2.95}


  0%|          | 0/2542 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.9820971867007673}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.09498069435358047, 'eval_accuracy': {'accuracy': 0.9820971867007673}, 'eval_runtime': 79.3049, 'eval_samples_per_second': 64.094, 'eval_steps_per_second': 32.053, 'epoch': 3.0}




{'loss': 0.0812, 'grad_norm': 0.060463402420282364, 'learning_rate': 3.7032664305391577e-07, 'epoch': 3.15}
{'loss': 0.1091, 'grad_norm': 0.1785110980272293, 'learning_rate': 3.3097205824478554e-07, 'epoch': 3.35}
{'loss': 0.1048, 'grad_norm': 0.07040317356586456, 'learning_rate': 2.9161747343565526e-07, 'epoch': 3.54}
{'loss': 0.1617, 'grad_norm': 0.0796733871102333, 'learning_rate': 2.52262888626525e-07, 'epoch': 3.74}
{'loss': 0.1153, 'grad_norm': 0.03725482150912285, 'learning_rate': 2.129083038173947e-07, 'epoch': 3.94}


  0%|          | 0/2542 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.9820971867007673}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.09886438399553299, 'eval_accuracy': {'accuracy': 0.9820971867007673}, 'eval_runtime': 79.2707, 'eval_samples_per_second': 64.122, 'eval_steps_per_second': 32.067, 'epoch': 4.0}




{'loss': 0.0971, 'grad_norm': 0.08040221035480499, 'learning_rate': 1.7355371900826445e-07, 'epoch': 4.13}
{'loss': 0.1284, 'grad_norm': 0.08809361606836319, 'learning_rate': 1.341991341991342e-07, 'epoch': 4.33}
{'loss': 0.1032, 'grad_norm': 0.11799431592226028, 'learning_rate': 9.484454939000392e-08, 'epoch': 4.53}
{'loss': 0.1699, 'grad_norm': 0.019832558929920197, 'learning_rate': 5.5489964580873666e-08, 'epoch': 4.72}
{'loss': 0.1026, 'grad_norm': 0.04426286369562149, 'learning_rate': 1.6135379771743405e-08, 'epoch': 4.92}


  0%|          | 0/2542 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.9820971867007673}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.09987545013427734, 'eval_accuracy': {'accuracy': 0.9820971867007673}, 'eval_runtime': 79.2019, 'eval_samples_per_second': 64.178, 'eval_steps_per_second': 32.095, 'epoch': 5.0}




{'train_runtime': 1334.5418, 'train_samples_per_second': 19.04, 'train_steps_per_second': 9.52, 'train_loss': 0.1640547661178931, 'epoch': 5.0}


TrainOutput(global_step=12705, training_loss=0.1640547661178931, metrics={'train_runtime': 1334.5418, 'train_samples_per_second': 19.04, 'train_steps_per_second': 9.52, 'total_flos': 3425650679643360.0, 'train_loss': 0.1640547661178931, 'epoch': 5.0})

### Generate prediction

In [19]:
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
Good
Good
Good
Good
Good
Good
Good
Good


### Optional: push model to hub

In [38]:
# option 1: notebook login
# from huggingface_hub import notebook_login
# notebook_login() # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

In [20]:
hf_name = 'cyrp' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want

In [21]:
model.push_to_hub(model_id) # save model



adapter_model.safetensors:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cyrp/distilbert-base-uncased-lora-text-classification/commit/e5fa741ab6df373c46f356582683d23c84319e9f', commit_message='Upload model', commit_description='', oid='e5fa741ab6df373c46f356582683d23c84319e9f', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
trainer.push_to_hub("distilbert-base-uncased-lora-text-classification-jailbreak-toxicchat") # save trainer

events.out.tfevents.1726168610.DESKTOP-LM2NI5P.23688.0:   0%|          | 0.00/8.89k [00:00<?, ?B/s]

events.out.tfevents.1726833438.DESKTOP-LM2NI5P.30988.0:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

events.out.tfevents.1726147136.DESKTOP-LM2NI5P.15232.0:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

events.out.tfevents.1726237440.DESKTOP-LM2NI5P.19052.0:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

events.out.tfevents.1726224042.DESKTOP-LM2NI5P.13160.0:   0%|          | 0.00/8.89k [00:00<?, ?B/s]

events.out.tfevents.1726833884.DESKTOP-LM2NI5P.7256.0:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cyrp/distilbert-base-uncased-lora-text-classification/commit/e26ada3bd249f759f8c532c8e72de97cd5ca2061', commit_message='distilbert-base-uncased-lora-text-classification-jailbreak-toxicchat', commit_description='', oid='e26ada3bd249f759f8c532c8e72de97cd5ca2061', pr_url=None, pr_revision=None, pr_num=None)

### Optional: load peft model

In [25]:
# how to load peft model from hub for inference
# config = PeftConfig.from_pretrained(model_id)
# inference_model = AutoModelForSequenceClassification.from_pretrained(
#     config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
# )
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# model = PeftModel.from_pretrained(inference_model, model_id)