In [None]:
import os
# Expose only GPU index 0 to libraries that honour CUDA_VISIBLE_DEVICES.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
! pip install peft
! pip install jsonlines
! pip install accelerate
! pip install bitsandbytes
! pip install trl

# Загружаем модель и токенизатор

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_path = "openlm-research/open_llama_3b_v2"

# The OpenLLaMA model card advises against the auto-converted fast tokenizer
# because it can produce incorrect tokenizations, so explicitly request the
# slow implementation. This requires the `sentencepiece` package.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; matmuls computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
# Load the causal LM with the quantization config above. The KV cache is
# switched off (`use_cache=False`) for this training-oriented setup.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    use_cache=False,
    quantization_config=bnb_config
)

In [None]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training


# Prepare the quantized model for adapter training: turn on gradient
# checkpointing, run PEFT's k-bit preparation step, and enable gradients on
# the model inputs.
# NOTE(review): prepare_model_for_kbit_training may already perform the first
# and third steps by default — confirm against the installed peft version.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model.enable_input_require_grads()

# LoRA configuration: rank-1 adapters on the `q_proj` and `v_proj` modules,
# 5% adapter dropout, no bias terms trained, causal-LM task type.
peft_config = LoraConfig(
    r=1,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the prepared model with the LoRA adapters described by `peft_config`.
model = get_peft_model(model, peft_config)

# Report the trainable parameters of the wrapped model.
model.print_trainable_parameters()

# Попробуем что-нибудь сгенерировать

In [None]:
# An explicit `model.cuda()` call is intentionally left disabled here.
# Put the model into evaluation mode before sampling from it.
model.eval()

In [None]:
%%time
from transformers import GenerationConfig

prompt = '### Question: Describe what summer means to you in one sentence.\n\n### Answer:'
tokens = tokenizer(prompt, return_tensors='pt')

output = model.generate(
    inputs=tokens['input_ids'].cuda(),
    generation_config=GenerationConfig(
        max_new_tokens=512,
        do_sample=True,
        temperature=0.5,
        top_k=40,
        top_p=0.8
    )
)

print(tokenizer.decode(output[0][tokens['input_ids'].shape[-1]:]).strip())

# Готовим датасет для обучения и валидации

In [None]:
from datasets import load_dataset
# Load the cleaned, binarized UltraFeedback preference dataset by its repo id.
dataset = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned")

In [None]:
def process(row):
    """Rewrite one preference row in place into plain-text DPO fields.

    The prompt is stripped and wrapped in the ``### Question`` /
    ``### Answer`` template. ``chosen`` and ``rejected`` are each replaced by
    the stripped ``content`` of the last message in their message list.

    Args:
        row: Mapping with a string ``prompt`` and message lists under
            ``chosen`` and ``rejected`` (each message has a ``content`` key).

    Returns:
        The same ``row`` object, mutated.
    """
    row["prompt"] = '### Question: ' + row["prompt"].strip() + '\n\n### Answer:'
    # Both completion columns get the identical treatment, in this order.
    for column in ("chosen", "rejected"):
        last_message = row[column][-1]
        row[column] = last_message['content'].strip()
    return row

In [None]:
# Apply `process` to the rows of the loaded dataset.
# NOTE: this rebinds `dataset` to the processed version, so re-running this
# cell would pass already-processed rows (string `chosen`/`rejected`) back
# into `process`, which expects message lists.
dataset = dataset.map(process)

In [None]:
# Keep only the first 64 examples of the train split for a quick run.
train_dataset = dataset['train'].select(range(64))

In [None]:
from transformers import TrainingArguments
from trl import DPOTrainer

In [None]:
# Training hyper-parameters for the DPO run: one epoch, batch size 8 per
# device, loss logged every step, no checkpoints saved, no external reporting.
#
# Evaluation is left at its default ("no"). The explicit
# `evaluation_strategy="no"` / `eval_steps=8` pair was dropped: it only
# restated the default, `eval_steps` is unused when evaluation is off, and the
# `evaluation_strategy` keyword is deprecated in newer transformers releases
# (renamed to `eval_strategy`), which makes the cell version-fragile.
train_args = TrainingArguments(
    output_dir='./output',
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_steps=1,
    save_strategy="no",
    report_to="none",
    warmup_ratio=0.0,
    # Keep the prompt/chosen/rejected columns; the DPO trainer reads them.
    remove_unused_columns=False,
    gradient_checkpointing=True
)

In [None]:
# Build the DPO trainer around the LoRA-wrapped model and the 64-example
# training subset.
# NOTE(review): no reference model is passed; presumably trl derives the
# reference policy from the PEFT model with adapters disabled — confirm for
# the installed trl version.
# NOTE(review): no length limits are passed either, so the trainer's own
# defaults apply to prompt/sequence truncation — confirm they are adequate.
trainer = DPOTrainer(
    model,
    args=train_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
)

In [None]:
# Show the current GPU state (memory use, utilisation) before training starts.
! nvidia-smi

In [None]:
# Run the DPO fine-tuning loop configured above.
trainer.train()