In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer, DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from huggingface_hub import notebook_login
from typing import List, Union, Any, Dict
import json
from datasets import Dataset

In [2]:
def prepare_dataset(path: str):
    """Load chat sessions from a JSON file and render each one as ChatML text.

    The file must contain a list of sessions, where every session is a list of
    message objects exposing the ``author`` and ``text`` keys (the only keys
    read here).

    Args:
        path: Location of the JSON file.

    Returns:
        A list with one string per session. Each message is rendered as
        ``<|im_start|>``, the author, a newline, the text and ``<|im_end|>``;
        messages of a session are joined with newlines. An empty session
        yields an empty string.
    """
    # Decode explicitly as UTF-8: JSON is UTF-8 by specification and the chat
    # logs contain non-ASCII text, so the platform default encoding must not
    # be relied upon.
    with open(path, "r", encoding="utf-8") as f:
        sessions = json.load(f)

    return [
        "\n".join(
            f"<|im_start|>{msg['author']}\n{msg['text']}<|im_end|>" for msg in session
        )
        for session in sessions
    ]

In [3]:
class DataCollatorForLanguageModelingChatML(DataCollatorForLanguageModeling):
    """
    Collator for chats in [ChatML](https://github.com/openai/openai-python/blob/main/chatml.md) format, like:
    ```
    <|im_start|>Alexander Smirnov
    >>> здарова нормик<|im_end|>
    <|im_start|>Федук
    >>> чо ты куда пропал то<|im_end|>
    <|im_start|>Alexander Smirnov
    >>> вечером сделаю бота
    >>> сегодня еще дедлайн по домке<|im_end|>
    ```
    The batch is built by the parent collator; afterwards every label from a `<|im_start|>` token up to and
    including the next newline token (i.e. the author header line) is overwritten with `ignore_index`, and so
    is every BOS token. All other labels are left exactly as the parent produced them.

    Reference data collator implementation: [DataCollatorForCompletionOnlyLM](https://github.com/huggingface/trl/blob/main/trl/trainer/utils.py#L56)

    Args:
        mlm (`bool`, *optional*, defaults to `False`): Forwarded unchanged to the underlying
            `DataCollatorForLanguageModeling` constructor.
        ignore_index (`int`, *optional*, defaults to `-100`):
            Label value written over the positions that are masked by this collator.
    """

    def __init__(
        self,
        *args,
        mlm: bool = False,
        ignore_index: int = -100,
        **kwargs,
    ):
        super().__init__(*args, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index

        encode = self.tokenizer.encode
        # NOTE(review): only one id is kept per marker, so this assumes the tokenizer maps each ChatML
        # marker to a single token - confirm for tokenizers without these special tokens.
        self.start_token = encode("<|im_start|>", add_special_tokens=False)[0]
        self.end_token = encode("<|im_end|>", add_special_tokens=False)[0]
        # The LAST id is taken for the newline; presumably the tokenizer may emit a prefix piece
        # before it - TODO confirm.
        self.new_line_token = encode("\n", add_special_tokens=False)[-1]
        self.bos_token = self.tokenizer.bos_token_id

        # Echo the resolved marker ids so they can be checked by eye.
        for marker_id in (self.start_token, self.end_token, self.new_line_token):
            print(marker_id)

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate `examples` with the parent class, then mask author headers and BOS in `labels`.

        Returns the parent's batch with `batch["labels"]` modified in place.
        """
        batch = super().torch_call(examples)
        labels = batch["labels"]

        for row_idx in range(len(examples)):
            row = labels[row_idx]  # view into the batch tensor: writes below land in batch["labels"]
            inside_header = False

            # Snapshot the ids first; masking a position never influences how later positions are read.
            for pos, token_id in enumerate(row.tolist()):
                if token_id == self.start_token:
                    inside_header = True

                if inside_header or token_id == self.bos_token:
                    row[pos] = self.ignore_index

                # The newline that ends the header is itself masked (above) before the flag is cleared.
                if token_id == self.new_line_token:
                    inside_header = False

        return batch

In [4]:
# Hub checkpoint to fine-tune (alternative: someone13574/Mistral-7B-v0.1-sharded).
base_model = "ehartford/dolphin-2.2.1-mistral-7b"

# One row per chat session, stored under the "session" column.
dataset = Dataset.from_dict(
    {"session": prepare_dataset("./data/messages.json")}
)

# The collator resolves its ChatML marker ids from this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(base_model)
data_collator = DataCollatorForLanguageModelingChatML(tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


32001
32000
13


In [5]:
# Eyeball one formatted session to check the ChatML layout.
sample_session = dataset[2500]["session"]
print(sample_session)

<|im_start|>Alexander Smirnov
>>> здарова нормик<|im_end|>
<|im_start|>Федук
>>> чо ты куда пропал то<|im_end|>
<|im_start|>Alexander Smirnov
>>> вечером сделаю бота
>>> сегодня еще дедлайн по домке<|im_end|>
<|im_start|>Федук
>>> ок, на один глаз залетишь на умник?<|im_end|>
<|im_start|>Alexander Smirnov
>>> на наш залечу<|im_end|>
<|im_start|>Федук
>>> кк<|im_end|>


In [6]:
# Tokenize a single session and run it through the collator as a one-example batch.
# Index the row first (as in the preview cell) instead of pulling the whole "session" column
# just to read one entry.
collator_res = data_collator([tokenizer(dataset[2500]["session"], return_tensors="pt")["input_ids"][0]])

In [7]:
# Decode only the label positions that were NOT masked (-100), i.e. what the loss is computed on.
sample_labels = collator_res["labels"][0]
print(tokenizer.decode(sample_labels[sample_labels != -100]))

>>> здарова нормик<|im_end|> 
>>> чо ты куда пропал то<|im_end|> 
>>> вечером сделаю бота
>>> сегодня еще дедлайн по домке<|im_end|> 
>>> ок, на один глаз залетишь на умник?<|im_end|> 
>>> на наш залечу<|im_end|> 
>>> кк<|im_end|>


In [8]:
# 4-bit NF4 quantisation settings with a bfloat16 compute dtype; double quantisation is off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map maps the root module ("") to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model, quantization_config=bnb_config, device_map={"": 0}
)

model.config.pretraining_tp = 1
# Disabling the cache silences the warnings during training. Please re-enable for inference!
model.config.use_cache = False
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Start the Weights & Biases run that the trainer reports to.
wandb.init(
    job_type="training",
    project='Fine tuning mistral 7B',
    anonymous="allow",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfuriousteabag[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# LoRA hyper-parameters; adapters are attached to the projection layers listed in target_modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"],
)

# Prepare the quantised base model for k-bit training, then wrap it with the adapter config.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [11]:
# Optimisation, checkpointing and logging settings for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 3,
    per_device_train_batch_size= 8,
    gradient_accumulation_steps= 2,
    optim = "paged_adamw_8bit",
    save_steps= 500,
    logging_steps= 10,
    logging_first_step=True,
    learning_rate= 2e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    group_by_length= True,
    # The plain "constant" schedule has no warmup phase, so no warmup_ratio is set here
    # (it would be silently ignored). For a ramp-up, switch to "constant_with_warmup"
    # and add warmup_ratio.
    lr_scheduler_type= "constant",
    report_to="wandb",
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    # NOTE(review): `model` was already wrapped with get_peft_model in the previous cell;
    # confirm whether the installed trl version still needs peft_config here.
    peft_config=peft_config,
    max_seq_length=1024,
    dataset_text_field="session",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    # Custom collator that masks author headers so the loss covers message text only.
    data_collator=data_collator,
)

Map:   0%|          | 0/7982 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
# Run the fine-tuning loop.
trainer.train()

# Save the fine-tuned model, then close the Weights & Biases run.
finetuned_model = trainer.model
finetuned_model.save_pretrained("finetuned-7b-telegram")
wandb.finish()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.3505
10,2.4403
20,2.4799
30,2.4946
40,2.481
50,2.3872
60,2.3576
70,2.346
80,2.4049
90,2.3762


VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,██▇▆▆▇▇▆▆▆▆▆▆▆▄▅▅▃▄▅▃▄▄▅▅▃▄▂▃▄▂▃▄▁▂▃▃▄▁▂
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,3.0
train/global_step,1497.0
train/learning_rate,0.0002
train/loss,1.6801
train/total_flos,3.035175370149888e+17
train/train_loss,1.97668
train/train_runtime,9151.9537
train/train_samples_per_second,2.616
train/train_steps_per_second,0.164
