### Ставим зависимости

In [None]:
!pip install jupyterlab_widgets ipywidgets -q
!sudo apt-get install git -y
!sudo apt-get install git-lfs -y
!git clone https://github.com/IlyaGusev/rulm.git
!git reset --hard 3bc0cd6700708c84ee444005f9e21c8b36230937
!git clean -df
!pip install -r ./rulm/requirements.txt -q
!pip uninstall wandb -y

### Загружаем базовую модель

In [None]:
# Switch the kernel's working directory to the repo's self_instruct folder.
# The relative paths used by later cells (models/, configs/, internal_prompts/)
# are resolved from here.
%cd ./rulm/self_instruct

In [None]:
# Hugging Face Hub repo id of the base model, and the directory (relative to
# the current working directory) where its snapshot is stored locally.
BASE_MODEL_HG_NAME = "lmsys/vicuna-13b-v1.5"
BASE_MODEL_LOCAL_PATH = "models/vicuna-13b-v1.5"

In [None]:
from huggingface_hub import snapshot_download

# Files left out of the download: repo docs and the *.safetensors weights
# (the pytorch_model-*.bin shards are the ones used later in this notebook).
skipped_patterns = ["LICENSE", "README.md", "*.safetensors"]

# Fetch the base model snapshot from the Hub into the local model directory.
snapshot_download(
    repo_id=BASE_MODEL_HG_NAME,
    local_dir=BASE_MODEL_LOCAL_PATH,
    ignore_patterns=skipped_patterns,
)

### Фиксим конфиги токенизатора
https://github.com/IlyaGusev/rulm/blob/master/self_instruct/README.md#fix-tokenizer

In [None]:
import json

# Special tokens shared by both tokenizer files; the pad token reuses <unk>.
special_tokens = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "pad_token": "<unk>",
    "unk_token": "<unk>",
}

# Complete replacement contents for each tokenizer file, keyed by file name.
# Key order is kept stable so the written JSON matches the expected layout.
replacements = {
    "tokenizer_config.json": {
        "tokenizer_class": "LlamaTokenizer",
        "model_max_length": 4096,
        "padding_side": "left",
        **special_tokens,
        "clean_up_tokenization_spaces": False,
        "special_tokens_map_file": "special_tokens_map.json",
    },
    "special_tokens_map.json": dict(special_tokens),
}

# Overwrite the downloaded tokenizer files inside the base model directory.
for target_name, payload in replacements.items():
    target_path = f'{BASE_MODEL_LOCAL_PATH}/{target_name}'
    with open(target_path, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=4)

In [None]:
!cp /home/rulm/self_instruct/models/vicuna-13b-v1.5/pytorch_model-00001-of-00003.bin /home/rulm/self_instruct/models/directum_13b/pytorch_model-00001-of-00003.bin
!cp /home/rulm/self_instruct/models/vicuna-13b-v1.5/pytorch_model-00002-of-00003.bin /home/rulm/self_instruct/models/directum_13b/pytorch_model-00002-of-00003.bin
!cp /home/rulm/self_instruct/models/vicuna-13b-v1.5/pytorch_model-00003-of-00003.bin /home/rulm/self_instruct/models/directum_13b/pytorch_model-00003-of-00003.bin

### Удаляем системный промпт

Если не удалить системный промпт, обученная модель будет корректно работать только с ним, а без него — заметно хуже.

In [None]:
# User and bot turns share the same wrapper: <s>{role}\n{content}</s>\n
turn_template = "<s>{role}\n{content}</s>\n"

# Prompt template with the system prompt and system message template left empty.
internal_prompts = {
    "system_prompt": "",
    "system_message_template": "",
    "user_message_template": turn_template,
    "bot_message_template": turn_template,
    "user_role": "user",
    "bot_role": "bot",
    "system_role": "system",
    "suffix": "<s>bot",
}

# Overwrite the repo's saiga_v2 prompt template with the version above.
serialized_prompts = json.dumps(internal_prompts, indent=4)
with open('./internal_prompts/saiga_v2.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized_prompts)

### Отключаем wandb и загрузку модели в 8 битах

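В файле **rulm/self_instruct/src/train.py** закомментировать строки 6, 245 и 246 (номера строк указаны для коммита `3bc0cd6`, на который репозиторий откатывается при установке зависимостей).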

В файле **rulm/self_instruct/src/train.py** в строке 104 заменить **report_to: str = 'wandb'** на **report_to: str = None**.

In [None]:
# Patch the training config in place: read it, turn off 8-bit loading, write it back.
config_path = './configs/saicuna_13b.json'

with open(config_path, 'r', encoding='utf-8') as config_file:
    model_config = json.load(config_file)

model_config.update(load_in_8bit=False)

with open(config_path, 'w', encoding='utf-8') as config_file:
    json.dump(model_config, config_file, indent=4)

### Фиксим двойной EOS-токен в конце промпта обучения

В файле **rulm/self_instruct/src/dataset.py** в строке 34 присвоить **self.add_global_eos** значение **False**.

В файле **rulm/self_instruct/src/dataset.py** заменить цикл

```
for message, role in conversation.iter_messages():
    message_input_ids = self.get_tokens(message)
    message_labels = message_input_ids
    if len(input_ids) + len(message_input_ids) > self.max_tokens_count:
        break

    labels_mask = [self.labels_pad_token_id for _ in range(len(message_input_ids))]
    if role != conversation.bot_role and self.only_target_loss:
        message_labels = labels_mask

    input_ids.extend(message_input_ids)
    labels.extend(message_labels)
```

на

```
conv = []
        
for message, role in conversation.iter_messages():
    conv.append([role, message])

conv[-1][1] = conv[-1][1].strip()

for msg in conv:
    message_input_ids = self.get_tokens(msg[1])
    message_labels = message_input_ids
    if len(input_ids) + len(message_input_ids) > self.max_tokens_count:
        break

    labels_mask = [self.labels_pad_token_id for _ in range(len(message_input_ids))]
    if msg[0] != conversation.bot_role and self.only_target_loss:
        message_labels = labels_mask

    input_ids.extend(message_input_ids)
    labels.extend(message_labels)
```

### Загрузка датасета

Загрузите файлы **train.jsonl** и **val.jsonl** в каталог **rulm/self_instruct** (текущий рабочий каталог ноутбука).

Формат датасета (каждая беседа на новой строке, согласно формату jsonl):

```
{"messages": [{"role": "user", "content": "Как дела?"}, {"role": "bot", "content": "Отлично"}], "source": "alpaca"}
{"messages": [{"role": "user", "content": "Кто ты?"}, {"role": "bot", "content": "Я бот"}], "source": "alpaca"}
```

### Обучение

In [None]:
# Launch fine-tuning through the repo's src.train module. It is run from
# rulm/self_instruct, so the config, dataset files and output directory below
# are all relative to that folder.
!python3 -m src.train --config-file configs/saicuna_13b.json --train-file train.jsonl --val-file val.jsonl  --output-dir models/directum_13b

### Исправляем конфиг инференса обученной модели

In [None]:
# Copy the base model's generation_config.json into the fine-tuned model's directory.
# NOTE(review): this runs before the following cell rewrites the base model's
# generation_config.json, so the copy holds the original values rather than the
# new ones — confirm this ordering is intended.
!cp ./models/vicuna-13b-v1.5/generation_config.json  ./models/directum_13b/generation_config.json 

In [None]:
# Generation settings written over the base model's generation_config.json:
# token ids for pad/bos/eos plus the sampling and repetition controls.
generation_config = {
    "pad_token_id": 0,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "temperature": 0.2,
    "top_p": 0.9,
    "top_k": 40,
    "do_sample": True,
    "max_new_tokens": 2560,
    "repetition_penalty": 1.1,
    "no_repeat_ngram_size": 15,
}

with open('./models/vicuna-13b-v1.5/generation_config.json', 'w') as config_file:
    json.dump(generation_config, config_file, indent=4)

### Сливаем адаптер в модель

In [None]:
import torch

from peft import PeftModel
from transformers import AutoModelForCausalLM


# Both loads use the same precision and device placement: fp16 on cuda:0.
fp16_on_first_gpu = {"torch_dtype": torch.float16, "device_map": 'cuda:0'}

# Base model: read only from the local snapshot, without 8-bit quantization.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_LOCAL_PATH,
    load_in_8bit=False,
    local_files_only=True,
    **fp16_on_first_gpu,
)

# Wrap the base model with the adapter stored in the training output directory.
model = PeftModel.from_pretrained(
    model,
    './models/directum_13b',
    local_files_only=False,
    **fp16_on_first_gpu,
)

# Merge the adapter into the base model and save the result to ./merged_model.
merged_model = model.merge_and_unload()
merged_model.save_pretrained('merged_model')