# Training PEFT models with new tokens being added to the embedding layers and tokenizer

In this example, we will learn how to train a LoRA model when adding new tokens to the tokenizer and model.
This is a common use case when doing the following:
1. Instruction finetuning with new tokens being added such as `<|user|>`, `<|assistant|>`, `<|system|>`, `</s>`, `<s>` to properly format the conversations.
2. Finetuning on a specific language wherein language-specific tokens are added, e.g., Korean tokens being added to the vocabulary for finetuning an LLM on Korean datasets.
3. Instruction finetuning to return outputs in a certain format to enable agent behaviour via new tokens such as `<|FUNCTIONS|>`, `<|BROWSE|>`, `<|TEXT2IMAGE|>`, `<|ASR|>`, `<|TTS|>`, `<|GENERATECODE|>`, `<|RAG|>`.

In such cases, you add the embedding modules to the LoRA `target_modules`. PEFT will take care of saving the embedding layers with the newly added tokens along with the adapter weights that were trained on the specific initialization of the embedding weights of the added tokens.

In [1]:
%pip install -q git+https://github.com/huggingface/peft

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m35.3 MB/s[0m eta [36m0:0

In [2]:
%pip install -q dataclass-csv

In [3]:
import os

import transformers
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import torch
from dataclasses import dataclass, field
from typing import Optional
from dataclass_csv import DataclassReader
from torch.utils.data import Dataset, DataLoader

from enum import Enum

## Prepare Model and Tokenizer

Now, we will be adding 27 new tokens as well as replacing the existing pad, bos and eos tokens of the model.

In [4]:
class SpecialTokens(str, Enum):
    """Custom marker tokens registered with the tokenizer in this example.

    The enum mixes in ``str``; each member's ``value`` is the literal token
    text that is added to the vocabulary.
    """

    # Delimiters around the target (the text the model learns to generate)
    begin_target = "<|begintarget|>"
    end_target = "<|endtarget|>"

    # Delimiters around the conversation context, plus speaker markers
    begin_context = "<|begincontext|>"
    end_context = "<|endcontext|>"
    system = "<|system|>"
    user = "<|user|>"
    begin_last_user_utterance = "<|beginlastuserutterance|>"
    end_last_user_utterance = "<|endlastuserutterance|>"

    # Remaining structural markers used inside targets
    # (exact semantics come from the dataset's annotation scheme)
    begin_dsts = "<|begindsts|>"
    end_dsts = "<|enddsts|>"
    begin_dst = "<|begindst|>"
    end_dst = "<|enddst|>"
    begin_belief = "<|beginbelief|>"
    end_belief = "<|endbelief|>"
    begin_response = "<|beginresponse|>"
    end_response = "<|endresponse|>"
    begin_action = "<|beginaction|>"
    end_action = "<|endaction|>"
    begin_user_action = "<|beginuseraction|>"
    end_user_action = "<|enduseraction|>"
    sys_actions = "<|sysactions|>"
    begin_intent = "<|beginintent|>"
    end_intent = "<|endintent|>"
    begin_requested_slots = "<|beginrequestedslots|>"
    end_requested_slots = "<|endrequestedslots|>"

    # Replacement padding and beginning-of-sequence tokens
    pad_token = "<|pad|>"
    bos_token = "<|startoftext|>"

    @classmethod
    def list(cls):
        """Return the token strings of all members, in declaration order."""
        return [member.value for member in cls.__members__.values()]

We will be finetuning the `Qwen2.5-3B-Instruct` model. Let's load the tokenizer and add the special tokens, then load the base model and resize the embedding layers to accommodate the newly added tokens.

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-3B-Instruct"

# Load the tokenizer, replacing pad/bos/eos with our own markers and
# registering every SpecialTokens value as an additional special token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    additional_special_tokens=SpecialTokens.list(),
    pad_token=SpecialTokens.pad_token.value,   # padding token
    bos_token=SpecialTokens.bos_token.value,   # beginning-of-sequence token
    eos_token=SpecialTokens.end_target.value,  # end-of-sequence token
)

# Load the base causal language model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    # use_flash_attention_2=True,
)

# The tokenizer gained new special tokens, so the embedding matrix has to be
# resized to match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

model.to("cuda")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151692, 2048)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=256, bias=True)
          (v_proj): Linear(in_features=2048, out_features=256, bias=True)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((2048,), eps=1e-06)
    (rotary_emb):

## Apply LoRA

In [8]:
# LoRA configuration. The embedding layer and the output head are targeted in
# addition to the attention and MLP projections, so the rows belonging to the
# newly added tokens are trained and saved with the adapter.
config = LoraConfig(
    r=64,            # rank of the low-rank decomposition
    lora_alpha=128,  # scaling coefficient
    lora_dropout=0.0,
    target_modules=[
        "embed_tokens",
        "lm_head",
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, config)

# print_trainable_parameters() prints the summary itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
model.print_trainable_parameters()


print(model)

trainable params: 139,412,992 || all params: 3,224,851,968 || trainable%: 4.3231
None
PeftModel(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(151692, 2048)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 64x151692 (cuda:0)])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 2048x64 (cuda:0)])
          (lora_magnitude_vector): ModuleDict()
        )
        (layers): ModuleList(
          (0-35): 36 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=True)
                (lora_dro



## Prepare Dataset

In [None]:
%pip install -q datasets

In [11]:
from datasets import load_dataset


# Download the dataset and split its "train" part into train/test (20% test)
dataset = load_dataset("smangrul/assistant_chatbot_dataset")["train"].train_test_split(test_size=0.2)

# Column holding the model input
text_column = "context"
# Column holding the expected output
label_column = "target"
# Fixed length every tokenized sample is padded / truncated to
max_length = 512

def preprocess_function(examples):
    """Tokenize a batch and build fixed-length causal-LM training samples.

    For every example the tokenized context is concatenated with the
    tokenized target and the EOS token. Labels are ``-100`` over the context
    (so it is ignored by the loss) and the real token ids over the target.
    Each sequence is then left-padded to ``max_length`` and cut to
    ``max_length`` items.

    Reads the module-level ``tokenizer``, ``text_column``, ``label_column``
    and ``max_length``.

    Args:
        examples: batch mapping column names to lists of values.

    Returns:
        The tokenizer output for the contexts, with ``input_ids`` and
        ``attention_mask`` rewritten in place and a ``labels`` entry added.
    """
    contexts = examples[text_column]
    answers = [str(item) for item in examples[label_column]]

    model_inputs = tokenizer(contexts)
    # Targets are tokenized without special tokens; EOS is appended manually
    labels = tokenizer(answers, add_special_tokens=False)

    pairs = zip(model_inputs["input_ids"], labels["input_ids"])
    for idx, (prompt_ids, answer_ids) in enumerate(pairs):
        answer_ids = answer_ids + [tokenizer.eos_token_id]

        # Full sequence: context followed by target + EOS
        full_ids = prompt_ids + answer_ids
        # Loss mask: -100 over the context, real ids over the target
        full_labels = [-100] * len(prompt_ids) + answer_ids
        full_mask = [1] * len(full_ids)

        # Left padding; a negative count yields an empty prefix
        pad_count = max_length - len(full_ids)

        # NOTE(review): samples longer than max_length are cut on the right,
        # which removes the tail of the target (including EOS) - confirm
        # this is intended.
        model_inputs["input_ids"][idx] = ([tokenizer.pad_token_id] * pad_count + full_ids)[:max_length]
        model_inputs["attention_mask"][idx] = ([0] * pad_count + full_mask)[:max_length]
        labels["input_ids"][idx] = ([-100] * pad_count + full_labels)[:max_length]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the preprocessing over every split of the dataset
processed_datasets = dataset.map(
    function=preprocess_function,
    batched=True,  # preprocess_function receives batches of examples
    remove_columns=dataset["train"].column_names,  # drop the original columns
    num_proc=1,  # number of worker processes
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# Final training dataset
train_dataset = processed_datasets["train"]

README.md:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

data%20%281%29.csv:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1233 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/986 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/247 [00:00<?, ? examples/s]

In [12]:
# Display the processed training split (features and number of rows)
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 986
})

In [23]:
# Shuffled loader over the processed training split, in batches of 8
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)

In [24]:
# Pull a single collated batch from the loader to inspect its tensors
next(iter(train_dataloader))

{'input_ids': tensor([[151667, 151667, 151667,  ..., 151682, 151666, 151666],
         [151667, 151667, 151667,  ..., 151682, 151666, 151666],
         [151667, 151667, 151667,  ..., 151682, 151666, 151666],
         ...,
         [151667, 151667, 151667,  ..., 151682, 151666, 151666],
         [151667, 151667, 151667,  ..., 151682, 151666, 151666],
         [151667, 151667, 151667,  ..., 151682, 151666, 151666]]),
 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]),
 'labels': tensor([[  -100,   -100,   -100,  ..., 151682, 151666, 151666],
         [  -100,   -100,   -100,  ..., 151682, 151666, 151666],
         [  -100,   -100,   -100,  ..., 151682, 151666, 151666],
         ...,
         [  -100,   -100,   -100,  ..., 151682, 151666, 151666],
         [  -100,   -100,   -100,  ..., 151682, 15166

In [25]:
# Decode the first training sample back to text to inspect padding and markers
tokenizer.decode(train_dataset[0]["input_ids"])

'<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad

# Train the model

In [26]:
import wandb
# Initialise Weights & Biases in "disabled" mode for this run
wandb.init(mode="disabled")

In [27]:
# Hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    output_dir="qwen_lora_clm_with_added_tokens",
    # optimisation schedule
    num_train_epochs=5.0,
    learning_rate=1e-5,
    warmup_steps=10,
    weight_decay=0.0001,
    # batching / data handling
    per_device_train_batch_size=4,
    dataloader_drop_last=True,
    remove_unused_columns=False,
    # precision and memory
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # logging and checkpoints
    logging_steps=10,
    save_total_limit=5,
    push_to_hub=False,
)

# NOTE(review): Trainer warns that label_names cannot be inferred for
# `PeftModel`; consider passing label_names=["labels"] explicitly - confirm.
trainer = Trainer(
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)
# model.config.use_cache = False
trainer.train()

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,4.9204
20,3.6979
30,3.5812
40,2.8253
50,2.4466
60,2.1484
70,1.9007
80,1.6993
90,1.6627
100,1.5325




TrainOutput(global_step=1230, training_loss=0.636938241439137, metrics={'train_runtime': 766.051, 'train_samples_per_second': 6.436, 'train_steps_per_second': 1.606, 'total_flos': 4.404571797454848e+16, 'train_loss': 0.636938241439137, 'epoch': 5.0})

In [28]:
# Save the trained model and the extended tokenizer into the same directory
# so they can be reloaded together.
trainer.save_model("qwen_lora_clm_with_added_tokens")
tokenizer.save_pretrained("qwen_lora_clm_with_added_tokens")



('qwen_lora_clm_with_added_tokens/tokenizer_config.json',
 'qwen_lora_clm_with_added_tokens/special_tokens_map.json',
 'qwen_lora_clm_with_added_tokens/vocab.json',
 'qwen_lora_clm_with_added_tokens/merges.txt',
 'qwen_lora_clm_with_added_tokens/added_tokens.json',
 'qwen_lora_clm_with_added_tokens/tokenizer.json')

# Check the model output on a sample from the evaluation dataset

In [29]:
import random


device = 'cuda'

# Pick a random example from the test split. randrange() excludes the upper
# bound, so the index is always valid; randint(0, n) could return n itself
# and raise an IndexError.
i = random.randrange(len(dataset["test"]))
context = dataset["test"][i]["context"]

# Tokenize the context and move the tensors to the generation device
batch = tokenizer(context, return_tensors="pt")
batch = {k: v.to(device) for k, v in batch.items()}


model.eval()
output_tokens = model.generate(
    **batch,
    max_new_tokens=256,
    do_sample=True,  # stochastic generation instead of greedy decoding
    temperature=0.2,  # lower temperature = more predictable answers
    top_p=0.95,  # nucleus sampling: cut off unlikely candidates
    top_k=50,  # sample only among the 50 most likely tokens
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Keep only the text that follows the end-of-context marker
target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split("<|endcontext|>")[1]
target = dataset["test"][i]["target"]

# Show the generated answer next to the expected one
print(f"Контекст: {context} \n\n Сгенерированный ответ: {target_predicted} \n\n Ожидаемый ответ: {target}")

Контекст: <|begincontext|><|user|>I am looking for a place to eat. I don't have a price preference.<|system|>What city should I search in?<|user|>Show me places to eat in San Francisco. I am looking for something afforadable priced.<|system|>What kind of food would you like to eat.<|user|>Some Pizza and Pasta would be perfect.<|system|>I found 10 restaurants that serve pizza and pasta. 54 Mint Ristorante Italiano is a nice restaurant located in San Francisco.<|user|>Yes, that works for me.<|system|>Would you like to reserve a table?<|user|>Yes, I would like to make a reservation. Make a reservation for Sunday this week for 4 people.<|system|>What time is the reservation for?<|user|>Mate it at 17:30.<|system|>Please confirm the following details: You are booking a table at 54 Mint Ristorante Italiano, the city is San Francisco, the reservation is at 5:30 pm, the date is the day after tomorrow, and the reservation is for 4 people.<|user|>No, I want it on Thursday next week.<|system|>Plea

# Save the Adapter model

When the LoRA layers are applied to the embedding layers, the corresponding base model embedding layers are also saved.

# Check that the model loads as expected and generates plausible outputs

In [30]:
from peft import PeftModel

# Start again from the untouched base checkpoint
inference_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,  # reduce memory usage while loading
    # use_flash_attention_2=True,
)

# Match the embedding size to the extended tokenizer before loading the adapter
inference_model.resize_token_embeddings(len(tokenizer))

# Load the saved LoRA weights on top of the base model
inference_model = PeftModel.from_pretrained(inference_model, "qwen_lora_clm_with_added_tokens")

inference_model.to(device)
inference_model.eval()

output_tokens = inference_model.generate(
    **batch,
    max_new_tokens=256,
    do_sample=True,  # stochastic generation
    temperature=0.2,  # randomness control (lower = more predictable)
    top_p=0.95,  # nucleus sampling: cut off unlikely candidates
    top_k=50,  # sample only among the 50 most likely tokens
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Keep only the answer, i.e. the text after the end-of-context marker
target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split("<|endcontext|>")[1]


print(f"Контекст: {context} \n\n Сгенерированный ответ: {target_predicted} \n\n Ожидаемый ответ: {target}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Контекст: <|begincontext|><|user|>I am looking for a place to eat. I don't have a price preference.<|system|>What city should I search in?<|user|>Show me places to eat in San Francisco. I am looking for something afforadable priced.<|system|>What kind of food would you like to eat.<|user|>Some Pizza and Pasta would be perfect.<|system|>I found 10 restaurants that serve pizza and pasta. 54 Mint Ristorante Italiano is a nice restaurant located in San Francisco.<|user|>Yes, that works for me.<|system|>Would you like to reserve a table?<|user|>Yes, I would like to make a reservation. Make a reservation for Sunday this week for 4 people.<|system|>What time is the reservation for?<|user|>Mate it at 17:30.<|system|>Please confirm the following details: You are booking a table at 54 Mint Ristorante Italiano, the city is San Francisco, the reservation is at 5:30 pm, the date is the day after tomorrow, and the reservation is for 4 people.<|user|>No, I want it on Thursday next week.<|system|>Plea