In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer, DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from huggingface_hub import notebook_login
from typing import List, Union, Any, Dict
import json
from datasets import Dataset

In [2]:
def prepare_dataset(path: str) -> List[str]:
    """Load chat sessions from a JSON file and render each one as a ChatML string.

    The file must contain a list of sessions; each session is a list of message
    objects with ``author`` and ``text`` keys. A message is rendered as
    ``<|im_start|>``, the author, a line break, the text, then ``<|im_end|>``.
    The messages of one session are joined with line breaks.

    Args:
        path: Location of the JSON file to read.

    Returns:
        One ChatML-formatted string per session, in file order. An empty
        session yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a message lacks the ``author`` or ``text`` key.
    """
    # Decode explicitly as UTF-8: the messages contain non-ASCII text, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f:
        sessions = json.load(f)

    return [
        "\n".join(f"<|im_start|>{msg['author']}\n{msg['text']}<|im_end|>" for msg in session)
        for session in sessions
    ]

In [3]:
class DataCollatorForLanguageModelingChatML(DataCollatorForLanguageModeling):
    """
    Data collator for [ChatML](https://github.com/openai/openai-python/blob/main/chatml.md) format, like:
    ```
    <|im_start|>Alexander Smirnov
    >>> hey, doing fine<|im_end|>
    <|im_start|>Feduk
    >>> so where did you disappear to<|im_end|>
    <|im_start|>Alexander Smirnov
    >>> I'll make the bot in the evening
    >>> there's also a homework deadline today<|im_end|>
    ```
    After the parent collator has built the batch, these label positions are
    overwritten with `ignore_index`:

    - every token from `<|im_start|>` up to and including the first newline
      token that follows it (the author header line);
    - every `<|im_end|>` token;
    - every BOS token.

    All other label positions (the message bodies, including newlines inside a
    multi-line message) are left as produced by the parent collator.

    Reference data collator implementation: [DataCollatorForCompletionOnlyLM](https://github.com/huggingface/trl/blob/main/trl/trainer/utils.py#L56)

    Args:
        mlm (`bool`, *optional*, defaults to `False`): Forwarded unchanged to the underlying
            `DataCollatorForLanguageModeling` constructor.
            NOTE(review): the parent presumably applies masked-language-modeling
            when this is `True`; keep `False` for causal LM training - confirm.
        ignore_index (`int`, *optional*, defaults to `-100`):
            The value written into `labels` at the masked positions listed above
            (`-100` matches the default `ignore_index` of `torch.nn.CrossEntropyLoss`).
    """

    def __init__(
        self,
        *args,
        mlm: bool = False,
        ignore_index: int = -100,
        **kwargs,
    ):
        super().__init__(*args, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index
        # NOTE(review): taking element [0] assumes each ChatML marker encodes to a
        # single token id, i.e. the tokenizer has them registered as added/special
        # tokens - verify when switching tokenizers.
        self.start_token = self.tokenizer.encode("<|im_start|>", add_special_tokens=False)[0]
        self.end_token = self.tokenizer.encode("<|im_end|>", add_special_tokens=False)[0]
        # The last id is used for the newline; presumably some tokenizers emit a
        # prefix token before it - TODO confirm.
        self.new_line_token = self.tokenizer.encode("\n", add_special_tokens=False)[-1]
        # May be None when the tokenizer defines no BOS token; in that case no
        # label ever compares equal to it.
        self.bos_token = self.tokenizer.bos_token_id

        # Show the resolved ids so a wrong marker lookup is easy to spot.
        print(self.start_token)
        print(self.end_token)
        print(self.new_line_token)

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate `examples` with the parent class, then mask ChatML scaffolding in the labels.

        Args:
            examples: Items accepted by the parent collator's `torch_call`.

        Returns:
            The batch built by the parent collator, with `batch["labels"]`
            modified in place as described in the class docstring.
        """
        batch = super().torch_call(examples)
        labels = batch["labels"]

        # Convert once to plain Python ints instead of doing a tensor index and
        # `.item()` call for every single token.
        for row_idx, row in enumerate(labels.tolist()):
            in_header = False
            masked_positions = []

            for pos, token in enumerate(row):
                if token == self.start_token:
                    in_header = True

                if in_header or token == self.end_token or token == self.bos_token:
                    masked_positions.append(pos)

                # Reset only after the masking check so that the newline closing
                # the author header is masked as well.
                if token == self.new_line_token:
                    in_header = False

            if masked_positions:
                labels[row_idx, masked_positions] = self.ignore_index

        return batch

In [6]:
# Checkpoint whose tokenizer is loaded below.
# Alternative checkpoint noted by the author: someone13574/Mistral-7B-v0.1-sharded
base_model = "ehartford/dolphin-2.2.1-mistral-7b"

# Model loading is disabled here; the cells shown only use the tokenizer.
#model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Single-column dataset: one ChatML-formatted string per chat session.
dataset = Dataset.from_dict({"session": prepare_dataset("./data/messages.json")})

# Collator that masks author headers, <|im_end|> and BOS tokens in the labels.
# Its constructor prints the resolved start, end and newline token ids.
data_collator = DataCollatorForLanguageModelingChatML(tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


32001
32000
13


In [5]:
# Show one formatted session as text; print() renders the embedded line breaks.
print(dataset[2500]["session"])

<|im_start|>Alexander Smirnov
>>> здарова нормик<|im_end|>
<|im_start|>Федук
>>> чо ты куда пропал то<|im_end|>
<|im_start|>Alexander Smirnov
>>> вечером сделаю бота
>>> сегодня еще дедлайн по домке<|im_end|>
<|im_start|>Федук
>>> ок, на один глаз залетишь на умник?<|im_end|>
<|im_start|>Alexander Smirnov
>>> на наш залечу<|im_end|>
<|im_start|>Федук
>>> кк<|im_end|>


In [7]:
# Tokenize one session and pass it through the collator as a single-example batch.
# Index the row first (dataset[idx]["session"]) so that only this row is read,
# instead of fetching the whole "session" column to pick one element.
collator_res = data_collator([tokenizer(dataset[2500]["session"], return_tensors="pt")["input_ids"][0]])

In [8]:
# Decode only the label positions the collator left unmasked, to check by eye
# that author headers and end markers were removed from the training targets.
# Compare against the collator's configured ignore_index, not a hard-coded -100.
sample_labels = collator_res["labels"][0]
print(tokenizer.decode(sample_labels[sample_labels != data_collator.ignore_index]))

>>> здарова нормик 
>>> чо ты куда пропал то 
>>> вечером сделаю бота
>>> сегодня еще дедлайн по домке 
>>> ок, на один глаз залетишь на умник? 
>>> на наш залечу 
>>> кк
