# Методы дообучения Больших Языковых Моделей.


**Credits: Данный ноутбук основан на наработках курса NLP от ШАД Яндекса** [yandexdataschool/nlp_course](https://github.com/yandexdataschool/nlp_course)

In [1]:
%pip install --upgrade transformers accelerate sentencepiece optimum peft bitsandbytes

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
from tqdm.auto import tqdm, trange

# Fail fast when no GPU is visible, then pick the device used by the rest of the cells.
_cuda_ok = torch.cuda.is_available()
assert _cuda_ok, "you need cuda for this part"
device = torch.device("cuda") if _cuda_ok else torch.device("cpu")



In [28]:
model_name = 'Enoch/llama-7b-hf'

# Load the Llama tokenizer (no device placement argument is needed for a tokenizer).
tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
# Use the EOS id as the padding id; later cells fill prompt placeholders with pad_token_id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# ... and the model itself. 4-bit quantization is requested through an explicit
# BitsAndBytesConfig because passing load_in_4bit directly to from_pretrained is deprecated.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    low_cpu_mem_usage=True,
    offload_state_dict=True,
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
)
# Freeze every pre-trained parameter; only parameters added later will be trained.
for param in model.parameters():
    param.requires_grad = False

In [11]:
# Prepare the frozen model for memory-efficient backpropagation.
model.gradient_checkpointing_enable()  # only store a small subset of activations, re-compute the rest.
model.enable_input_require_grads()  # override an implementation quirk in gradient checkpoints that disables backprop unless inputs require grad


### Шаг №1: Prompt tuning

![img](https://i.imgur.com/Ux3qQAu.png)

source: theodd1souts.fandom.com

In [12]:
prompt = "A quick brown fox"
batch = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(device)

# Greedy decoding: append the most likely next token ten times.
# This is inference only, so autograd is disabled to avoid building a graph at every step.
with torch.no_grad():
    for _ in range(10):
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
        batch["input_ids"] = torch.cat([batch["input_ids"], next_token], dim=-1)
        # keep the attention mask in sync with the extended sequence
        batch["attention_mask"] = torch.cat(
            [batch["attention_mask"], torch.ones_like(next_token)], dim=-1
        )

print("\nOutput:", tokenizer.decode(batch["input_ids"][0].cpu().numpy().tolist()))


Output: <s>A quick brown fox jumps over the lazy dog.
A quick


In [13]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors="pt", return_token_type_ids=False).to(device)
outputs = model(**batch)

# Next-token prediction: logits at position t are scored against the token at position t + 1,
# so the last logit and the first token are dropped.
next_word_logits = outputs.logits[:, :-1]
true_next_tokens = batch["input_ids"][:, 1:]
vocab_size = next_word_logits.shape[-1]
loss = F.cross_entropy(
    next_word_logits.reshape(-1, vocab_size),
    true_next_tokens.reshape(-1),
)

print("Loss:", loss)

Loss: tensor(3.0729, device='cuda:0', grad_fn=<NllLossBackward0>)


Воспользуемся механизмом prompt-tuning, чтобы модель отвечала "no dog was jumped over today" на запросы. Подробнее — в статье о [prompt tuning](https://arxiv.org/abs/2104.08691).

![img](https://i.imgur.com/VwNNKnb.png)


In [14]:
class WordEmbeddingsWithLearnedPrompts(nn.Module):
    """Embedding layer that puts trainable prompt vectors in place of the first N token embeddings.

    Meant to replace a model's original word embedding layer for prompt tuning: the caller
    prepends ``num_prompts`` placeholder (padding) tokens to ``input_ids`` and this layer
    substitutes ``learnable_prompts`` for the embeddings of those placeholders.

    Args:
        word_embeddings: the original embedding layer, used for every non-prompt token.
        num_prompts: number of leading positions replaced by trainable prompts.
        pad_token_id: id of the placeholder token expected in the first ``num_prompts``
            positions. When ``None`` (default) the module-level ``tokenizer.pad_token_id``
            is looked up at call time.
    """

    def __init__(self, word_embeddings: nn.Embedding, num_prompts: int, pad_token_id: "int | None" = None):
        super().__init__()
        self.original_word_embeddings = word_embeddings
        self.num_prompts = num_prompts
        self.pad_token_id = pad_token_id
        # One shared set of prompts, shape [1, num_prompts, embedding_dim]; broadcast per batch in forward.
        self.learnable_prompts = nn.Parameter(
            torch.randn(1, num_prompts, word_embeddings.embedding_dim),
            requires_grad=True,
        )

    def forward(self, input_ids: torch.LongTensor):
        """Embed ``input_ids`` ([batch_size, seq_length]), replacing the first ``num_prompts`` positions.

        Returns:
            Tensor of shape [batch_size, seq_length, embedding_dim] whose first ``num_prompts``
            vectors are the learnable prompts and whose remaining vectors are ordinary embeddings.

        Raises:
            AssertionError: if ``input_ids`` is not int64, is not longer than ``num_prompts``,
                or does not start with ``num_prompts`` placeholder tokens.
        """
        assert input_ids.dtype == torch.int64
        assert input_ids.shape[1] > self.num_prompts
        placeholder_id = (
            self.pad_token_id if self.pad_token_id is not None else tokenizer.pad_token_id
        )
        assert torch.all(
            input_ids[:, : self.num_prompts] == placeholder_id
        ).item(), "don't forget to prepend num_prompts PAD tokens to input_ids"

        # Broadcast the single prompt block to every sequence in the batch; expand() creates
        # a view, so gradients from all sequences accumulate into the same parameter.
        batch_size = input_ids.shape[0]
        prompts = self.learnable_prompts.expand(batch_size, -1, -1)
        token_embeddings = self.original_word_embeddings(input_ids[:, self.num_prompts :])

        # torch.cat (rather than item assignment) keeps the result differentiable w.r.t. the prompts
        return torch.cat([prompts, token_embeddings], dim=1)

In [15]:
num_prompts = 16
test_emb_layer = WordEmbeddingsWithLearnedPrompts(model.model.embed_tokens, num_prompts=num_prompts).to(device)
test_input_ids = tokenizer("a cat say on a may", return_tensors='pt')['input_ids'].to(device)

# Prepend num_prompts placeholder (PAD) ids to every sequence; the layer swaps them for prompts.
space_for_prompts = torch.full([len(test_input_ids), num_prompts], fill_value=tokenizer.pad_token_id,
                               dtype=torch.int64, device=device)
test_inputs_with_prompts = torch.cat([space_for_prompts, test_input_ids], dim=1)

# torch.autocast replaces the deprecated torch.cuda.amp.autocast context manager
with torch.autocast(device_type="cuda"):
    test_prompt_embeddings = test_emb_layer(test_inputs_with_prompts)

# One vector per input position, with the model's hidden size
assert test_prompt_embeddings.shape[:2] == test_inputs_with_prompts.shape
assert test_prompt_embeddings.shape[-1] == model.config.hidden_size
# Prompt positions hold the learnable prompts; the rest match the ordinary embeddings
assert torch.allclose(test_prompt_embeddings[:, :num_prompts], test_emb_layer.learnable_prompts.float())
assert torch.allclose(test_prompt_embeddings[:, num_prompts:], model.model.embed_tokens(test_input_ids).float())
print("Looks legit!")

Looks legit!


  with torch.cuda.amp.autocast():


__Работает!__ Давайте посмотрим на результаты.

In [16]:
# Guard against wrapping the embedding layer twice when this cell is re-run.
assert isinstance(model.model.embed_tokens, nn.Embedding), "you have already replaced the embedding layer. If the replacement is broken, please reload the model"

# Swap the model's embedding layer for the prompt-aware wrapper.
prompt_embedding_layer = WordEmbeddingsWithLearnedPrompts(
    model.model.embed_tokens, num_prompts=num_prompts
)
model.model.embed_tokens = prompt_embedding_layer.to(device)

# The optimizer only ever sees the prompt vectors.
trainable_prompts = [model.model.embed_tokens.learnable_prompts]
opt = torch.optim.Adam(trainable_prompts, lr=0.01)

In [21]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)
# Reserve num_prompts placeholder (PAD) positions per sequence, sized from this batch.
space_for_prompts = torch.full([len(batch['input_ids']), num_prompts], fill_value=tokenizer.pad_token_id,
                               dtype=torch.int64, device=device)
batch['input_ids'] = torch.cat([space_for_prompts, batch['input_ids']], dim=1)
batch['attention_mask'] = torch.cat([torch.ones_like(space_for_prompts), batch['attention_mask']], dim=1)

outputs = model(**batch)
# Skip the prompt positions: only predictions for real tokens contribute to the loss.
next_word_logits = outputs.logits[:, num_prompts : -1, :]
true_next_tokens = batch['input_ids'][:, num_prompts + 1:]
loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
print("Loss:", loss)

Loss: tensor(7.4865, device='cuda:0', grad_fn=<NllLossBackward0>)


In [None]:
# checking if the model can learn. Change max_steps for proper training
import datasets

# Load the first 32 quotes and tokenize the "quote" column in batches.
data = datasets.load_dataset(
    "Abirate/english_quotes", split="train[:32]"
).map(lambda examples: tokenizer(examples["quote"]), batched=True)
# silence a warning from HF trainer
model._hf_peft_config_loaded = True

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [23]:
from tqdm.auto import tqdm

In [24]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors="pt", return_token_type_ids=False).to(
    device
)
# Reserve num_prompts placeholder (PAD) positions per sequence, sized from this batch.
space_for_prompts = torch.full(
    [len(batch["input_ids"]), num_prompts],
    fill_value=tokenizer.pad_token_id,
    dtype=torch.int64,
    device=device,
)
batch["input_ids"] = torch.cat([space_for_prompts, batch["input_ids"]], dim=1)
batch["attention_mask"] = torch.cat(
    [torch.ones_like(space_for_prompts), batch["attention_mask"]], dim=1
)

# Train the prompts (through `opt`) for at most 100 steps, stopping early once the loss is small.
for _ in tqdm(range(100)):
    outputs = model(**batch)
    # Skip the prompt positions: only predictions for real tokens contribute to the loss.
    next_word_logits = outputs.logits[:, num_prompts:-1, :]
    true_next_tokens = batch["input_ids"][:, num_prompts + 1 :]
    loss = F.cross_entropy(
        next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1)
    )
    print("Loss:", loss)
    loss.backward()
    opt.step()
    opt.zero_grad()

    if loss.item() <= 0.1:
        break


assert loss.item() <= 0.1
print("Good job!")

  0%|          | 0/100 [00:00<?, ?it/s]

Loss: tensor(7.4865, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(6.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(6.3117, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(5.7771, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(5.3190, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(4.9438, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(4.6262, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(4.3398, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(4.0716, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(3.8189, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(3.5819, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(3.3579, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(3.1411, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(2.9275, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tensor(2.7184, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss: tens

In [25]:
prompt = "A quick brown fox"
batch = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(device)
# Build the placeholder block for this batch so the cell does not depend on leftovers
# from the training cell.
space_for_prompts = torch.full(
    [len(batch["input_ids"]), num_prompts],
    fill_value=tokenizer.pad_token_id,
    dtype=torch.int64,
    device=device,
)
batch["input_ids"] = torch.cat([space_for_prompts, batch["input_ids"]], dim=1)
batch["attention_mask"] = torch.cat(
    [torch.ones_like(space_for_prompts), batch["attention_mask"]], dim=1
)

# Greedy decoding; inference only, so autograd is disabled to avoid building a graph per step.
with torch.no_grad():
    for _ in range(15):
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
        batch["input_ids"] = torch.cat([batch["input_ids"], next_token], dim=-1)
        batch["attention_mask"] = torch.cat(
            [batch["attention_mask"], torch.ones_like(next_token)], dim=-1
        )

# Decode without the placeholder positions that were prepended for the prompts.
print(
    "\nOutput:",
    tokenizer.decode(batch["input_ids"][0, num_prompts:].cpu().numpy().tolist()),
)

# if you did everything right, the model will deny that the fox jumped over the lazy dog


Output: <s>A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it


In [None]:
# 1. Invert the words order

### Шаг 1.1 (опциональный): HuggingFace PEFT

HuggingFace также предоставил широко применимый инструмент для дообучения: [`peft`](https://huggingface.co/docs/peft/index). В нём реализованы многие современные техники: prompt-tuning, LoRA и другие.



In [26]:
import peft

# PEFT needs the original embedding layer, not the hand-written prompt wrapper.
assert isinstance(model.model.embed_tokens, nn.Embedding), "please reload the model"

peft_config = peft.PromptTuningConfig(
    task_type=peft.TaskType.CAUSAL_LM,
    num_virtual_tokens=16,
)
# note: for most peft methods, this line also modifies model in-place
model = peft.get_peft_model(model, peft_config)

# Report how many parameters will actually receive gradients versus the total.
trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_count)
total_count = sum(p.numel() for p in model.parameters())
print("Total parameters (excluding quantization):", total_count)

AssertionError: please reload the model

In [None]:
# Your task: optimize the PEFT-wrapped model to achieve next token prediction loss < 0.1, but this time using PEFT
# Please note: you no longer need to prepend PAD tokens, but you still need to skip :num_virtual_tokens: first logits.
# Finally, generate the sentence to make sure that the model learned the truth.

In [None]:
# Feel free to structure your code as you see fit - as long as it's legible :)

### Шаг 2: LoRA

При дообучении для более серьезных задач можно обратиться к линейной алгебре и вспомнить о __ранге матрицы__. Низкоранговые адаптеры на основе матричного разложения описаны в [статье о LoRA](https://arxiv.org/pdf/2106.09685.pdf).

Основная идея заключается в добавлении низкоранговых адаптеров параллельно с существующими линейными слоями:
<center><img src="https://i.imgur.com/6bQLNiG.png" width=240px></center>

В оригинальной статье по LoRA адаптеры добавлялись только к матрицам внимания. Тем не менее, [новые работы](https://arxiv.org/abs/2305.14314) показывают, что также полезно применять их и к полносвязным частям.

Для начала реализуем базовый слой LoRA.

In [None]:
# re-load the model to remove any previous PEFT tuners
# 4-bit quantization is requested through an explicit BitsAndBytesConfig because passing
# load_in_4bit directly to from_pretrained is deprecated.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    low_cpu_mem_usage=True,
    offload_state_dict=True,
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
)
# Freeze every pre-trained parameter; only adapters added later will be trained.
for param in model.parameters():
    param.requires_grad = False
# Same memory-saving setup as for the first model instance.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [None]:
class LoRALayer(nn.Module):
    """Adds a trainable low-rank (LoRA-style) branch in parallel to an existing linear layer.

    The output is ``module(input) + (input @ adapter_A) @ adapter_B``. ``adapter_B`` starts
    at zero, so a freshly created layer produces the wrapped module's output.
    """

    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()
        # pre-trained (frozen) linear layer
        self.module = module
        # Create the adapter weights on the same device as the wrapped layer's weight.
        weight_device = module.weight.device

        down_projection = torch.empty(module.in_features, rank, device=weight_device)
        self.adapter_A = nn.Parameter(down_projection)
        nn.init.kaiming_uniform_(self.adapter_A, a=5**0.5)

        up_projection = torch.zeros(rank, module.out_features, device=weight_device)
        self.adapter_B = nn.Parameter(up_projection)

    def forward(self, input):
        """Return the wrapped layer's output plus the low-rank adapter's output."""
        frozen_output = self.module(input)
        adapter_output = (input @ self.adapter_A) @ self.adapter_B
        return frozen_output + adapter_output

In [None]:
# test your implementation
# Wrap a small linear layer with identity weights so expected outputs are easy to state.
test_linear = nn.Linear(128, 128)
test_linear.weight.data[...] = torch.eye(128)
test_adapter = LoRALayer(test_linear, rank=8)

# With identity weights, an input of ones must come back as bias + 1 from a fresh adapter.
initial_output = test_adapter(torch.ones(1, 1, 128))
assert torch.allclose(initial_output, test_linear.bias + 1), "please check your forward pass"

# Fixed, non-trivial weights so that loss and gradients can be compared with reference values.
test_adapter.adapter_A.data[...] = torch.linspace(0.1, -0.5, 128 * 8).view(128, 8)
test_adapter.adapter_B.data[...] = torch.linspace(0.5, -0.1, 128 * 8).view(8, 128)
test_linear.bias.data[...] = torch.linspace(1.0, -1.0, 128)

dummy_input = torch.ones(1, 128) / 128
dummy_target = torch.linspace(-1, 1, 128)
dummy_loss = F.mse_loss(test_adapter(dummy_input).squeeze(), dummy_target)
assert torch.allclose(dummy_loss, torch.tensor(1.3711389), rtol=0, atol=1e-4)
dummy_loss.backward()

adapter_weights = (test_adapter.adapter_A, test_adapter.adapter_B)
assert all(w.grad is not None for w in adapter_weights), "some adapter weights have no grad"
grad_sum_A = test_adapter.adapter_A.grad.sum()
assert torch.allclose(grad_sum_A, torch.tensor(-0.60158), rtol=0, atol=1e-4), "bad grad w.r.t. A"
grad_sum_B = test_adapter.adapter_B.grad.sum()
assert torch.allclose(grad_sum_B, torch.tensor(0.9931), rtol=0, atol=1e-4), "bad grad w.r.t. B"
# note: bad grad means that your code is different from LoRA paper OR that your code is not autograd-friendly (e.g. no_grad)

# Drop the temporaries so nothing from this check lingers in the notebook namespace.
del initial_output, dummy_input, dummy_target, adapter_weights, grad_sum_A, grad_sum_B
del dummy_loss, test_linear, test_adapter
print("All tests passed!")

All tests passed!


Ниже приведен код, который применяет адаптер LoRA к линейным слоям Q/K/V внимания модели. Модифицировать можно и другие слои:
* self_attn.o_proj
* mlp.up_proj, mlp.gate_proj, mlp.down_proj
* lm_head

In [None]:
lora_rank = 8

# Collect the decoder layers up front: the loop below replaces sub-modules, and mutating
# the module tree while lazily iterating over named_modules() is fragile.
decoder_layers = [
    module
    for module in model.model.layers.modules()
    if "LlamaDecoderLayer" in repr(type(module))
]
assert decoder_layers, "no LlamaDecoderLayer modules found in model.model.layers"

for layer in decoder_layers:
    for proj_name in ("q_proj", "k_proj", "v_proj"):
        projection = getattr(layer.self_attn, proj_name)
        if isinstance(projection, LoRALayer):
            continue  # already wrapped (cell re-run); do not stack a second adapter
        setattr(
            layer.self_attn,
            proj_name,
            LoRALayer(projection, rank=lora_rank).to(device),
        )

# Three adapters (Q, K, V) per decoder layer, which is 96 for Llama-7B
assert sum(isinstance(module, LoRALayer) for module in model.modules()) == 3 * len(
    decoder_layers
)

In [None]:
# Move the inputs to the same device as in the other cells of this notebook.
batch = tokenizer(
    "This model wants to share its greatest secret:",
    return_tensors="pt",
    return_token_type_ids=False,
).to(device)
# test a single training step, make sure we get meaningful gradients
# (torch.autocast replaces the deprecated torch.cuda.amp.autocast context manager)
with torch.autocast(device_type="cuda", dtype=torch.float32):
    out = model(**batch)
    (out.logits.norm() / 100).backward()

# Every adapter must have received a non-zero gradient on its B matrix.
for module in model.modules():
    if isinstance(module, LoRALayer):
        assert module.adapter_B.grad is not None
        assert module.adapter_B.grad.norm().item() > 0

# Drop the gradients from this probe step so they do not leak into real training.
model.zero_grad(set_to_none=True)
print("Grad check successful, well done!")

  with torch.cuda.amp.autocast(dtype=torch.float32):


Grad check successful, well done!


Приведенный ниже пример показывает, как обучить адаптеры LoRA на небольшом наборе данных.

In [None]:
# checking if the model can learn. Change max_steps for proper training
import datasets

# Load the first 32 quotes and tokenize the "quote" column in batches.
data = datasets.load_dataset(
    "Abirate/english_quotes", split="train[:32]"
).map(lambda examples: tokenizer(examples["quote"]), batched=True)
# silence a warning from HF trainer
# NOTE(review): this sets a private transformers flag by hand; it may be related to the
# `active_adapters` UnboundLocalError shown in the Trainer cell output — confirm.
model._hf_peft_config_loaded = True

README.md:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [None]:
# PEFT = Parameter-Efficient Fine-Tuning

In [None]:
# Fine-tune the trainable (adapter) parameters with the HF Trainer on the tokenized quotes.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        # note: if you want larger batch size, increase gradient_accumulation_steps
        # NOTE(review): warmup_steps exceeds max_steps, so with a warmup schedule the learning
        # rate never reaches `learning_rate` in this demo run — revisit for proper training.
        warmup_steps=250,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        # The string "none" disables all reporting integrations; passing None instead
        # falls back to the library default.
        report_to="none",
    ),
    # mlm=False: plain causal language modeling batches
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

# NOTE: this is just an example! you do not have to wait for this progressbar to finish :)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,1.2714
2,0.3776
3,1.4694
4,1.4272
5,0.8513
6,1.6487
7,1.8406
8,1.2712
9,0.5603
10,1.2858




UnboundLocalError: local variable 'active_adapters' referenced before assignment

In [None]:
prompt = "Если где-то тебе не рады в рваных носках "
batch = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to(device)
# Greedy decoding; inference only, so autograd is disabled to avoid building a graph per step.
with torch.no_grad():
    for _ in range(15):
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
        batch["input_ids"] = torch.cat([batch["input_ids"], next_token], dim=-1)
        # keep the attention mask in sync with the extended sequence
        batch["attention_mask"] = torch.cat(
            [batch["attention_mask"], torch.ones_like(next_token)], dim=-1
        )

print(
    "\nOutput:",
    tokenizer.decode(batch["input_ids"][0, :].cpu().numpy().tolist()),
)

# the model greedily continues the prompt; see the decoded output below


Output: <s>Если где-то тебе не рады в рваных носках ходить, то в этом году нашлось новое облепие


In [None]:
1. Развернуть фразу на входе (на уровне слов) с помощью p-tune
2. Сделать то же самое с помощью библиотеки peft от HF
3. Дообучить с помощью LoRA одну из моделей (лучше gemma:2b или phi3.5, т.к. они небольшие) делать что-то прикольное на ваш выбор


## Шаг 3: Дополнительное задание, *фактическое* обучение модели

Ваша задача - дообучить модель для _генерации кода на Python_. Пожалуйста, используйте вышеприведенные примеры в качестве вдохновения. Например:

* dataset: используйте [codeparrot-clean](https://huggingface.co/datasets/codeparrot/codeparrot-clean) или любые другие данные, содержащие код на Python. Так как для этого упражнения не нужно много данных, достаточно использовать только валидационную часть codeparrot — она меньше по размеру.
* предобработка: выберите код на Python на основе расширений файлов (.py) (можно пропустить в случае codeparrot - 100% этого датасета – Python)
* короткие строки: используйте первые 512 символов каждой строки
* тип адаптера: используйте LoRA, плюс как минимум один из:
   - дополнительный адаптер на lm_head
   - дополнительный адаптер на компоненты MLP (mlp.*)
   - обучаемые входные эмбеддинги (требуется настройка использования памяти)

* обучение: вам не обязательно обучать до сходимости. Если все пройдет хорошо, ваша модель должна начать генерировать код после 500 шагов. Пожалуйста, используйте batch size не менее 4 (4 x 1 x 512 токенов) с использованием gradient_accumulation_steps=4.

Примечание: в библиотеке peft также есть реализация LoRA. Однако мы просим вас показать хотя бы один полный запуск обучения с вашим собственным кодом LoRA для этого задания.

Альтернативное задание: Вместо написания кода на Python, вы можете заменить задачу любым другим набором данных, например, вашим любимым исполнителем или подкастом, при условии, что это этично. Если вы выберете собственную задачу, пожалуйста, покажите примеры того, что ваша модель выучила - или не выучила, аналогично приведенным ниже примерам кода.

In [None]:
# Generation prompts: an empty string plus a few Python-flavoured starters.
# feel free to add a few more that are not 100% associated with Python
prompts = ["", "import", "from", "while", "try", "if", "for", "torch"]

# <A WHOLE LOT OF YOUR CODE>
# generate baseline samples with the selected prompts before finetuning
# please feel free to use transformers.Trainer (as above) or your custom training code
# after the training concludes, please show examples of text generated by your model. It is expected to look like Python code fragments
# print the generation examples nicely (suggestion: use pandas or HTML) for easier comparison
# note: your LoRA-enhanced model can run generation the same way as the non-trained model (above)

In [None]:
# This template helps to compare generated code samples in pretty table form
# feel free to present your work in other forms

from IPython.display import HTML, display

import html  # used to escape prompts and generated text before embedding them in HTML

# Outer table with a header row; the `{}` slot receives the rendered rows.
table_template = """<table style="border:1px solid black" >
  <tr>
    <th style="text-align: center; border:1px solid black">PROMPT</th>
    <th style="text-align: center; border:1px solid black">BEFORE</th>
    <th style="text-align: center; border:1px solid black">AFTER</th>
  </tr>
{}
</table>"""

# One row per prompt: the prompt itself, the sample before finetuning, the sample after.
row_template = """  <tr>
    <td style="width:20%; border:1px solid black"><pre align="left">`{}`</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
  </tr>"""

rows = []

for prompt in prompts:
    # replace the placeholder strings below with your generated samples
    before_text = "BEFORE FINETUNING"
    after_text = "TO BE GENERATED AFTER FINETUNING"
    # Escape every cell: generated code routinely contains `<`, `>` and `&`,
    # which would otherwise be interpreted as markup and corrupt the table.
    rows.append(
        row_template.format(
            html.escape(prompt), html.escape(before_text), html.escape(after_text)
        )
    )

display(HTML(table_template.format("\n".join(rows))))

### Доп. материалы:

* How post-training quantization works: https://arxiv.org/abs/2208.07339
* An overview of running large models: https://huggingface.co/docs/accelerate/package_reference/big_modeling
* A general library for different adapter types: https://adapterhub.ml/


### P.s.
Приведенный выше код можно достаточно легко адаптировать ко многим современным и не очень моделям: [Falcon-7B](https://huggingface.co/tiiuae/falcon-7b), [OPT-6.7B](https://huggingface.co/facebook/opt-6.7b) или [BLOOM-7.1B](https://huggingface.co/bigscience/bloom-7b1).

Но вам может понадобиться изменить некоторые переменные:
1. Название модели для `AutoModelForCausalLM.from_pretrained()` и `AutoTokenizer`
2. Для prompt-tuning обратите внимание на `model.model.embed_tokens`.
3. Доработайте код для добавления LoRA. Сам адаптер не требует изменений.