In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer, DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from huggingface_hub import notebook_login
from typing import List, Union, Any, Dict
import json
from datasets import Dataset

In [2]:
def prepare_dataset(path: str) -> List[str]:
    """Load chat sessions from a JSON file and render each one as a ChatML string.

    The file is read as a JSON list of sessions, where each session is a list of
    message objects exposing ``author`` and ``text`` keys. Every message becomes
    ``<|im_start|>{author}\\n{text}<|im_end|>`` and the messages of one session
    are joined with a single newline.

    Args:
        path: Location of the JSON file to read.

    Returns:
        One formatted string per session, in file order. A session without
        messages yields an empty string.
    """
    # Decode explicitly as UTF-8: the chats contain non-ASCII text, and relying on
    # the platform default encoding breaks on systems whose locale is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        sessions = json.load(f)

    return [
        "\n".join(f"<|im_start|>{msg['author']}\n{msg['text']}<|im_end|>" for msg in session)
        for session in sessions
    ]

In [3]:
class DataCollatorForLanguageModelingChatML(DataCollatorForLanguageModeling):
    """
    Language-modeling collator for chats serialized in
    [ChatML](https://github.com/openai/openai-python/blob/main/chatml.md) format, for example
    (illustrative, translated sample):
    ```
    <|im_start|>Alexander Smirnov
    >>> hey, all good<|im_end|>
    <|im_start|>Feduk
    >>> so where did you disappear to<|im_end|>
    <|im_start|>Alexander Smirnov
    >>> I'll make the bot this evening
    >>> there's also a homework deadline today<|im_end|>
    ```
    After the parent collator has built the batch, the labels are rewritten so that
    every header span (from `<|im_start|>` through the first newline token that
    follows it, inclusive) and every BOS token is replaced by `ignore_index`.
    Message bodies and `<|im_end|>` tokens keep their original labels.

    Reference data collator implementation: [DataCollatorForCompletionOnlyLM](https://github.com/huggingface/trl/blob/main/trl/trainer/utils.py#L56)

    Args:
        mlm (`bool`, *optional*, defaults to `False`): Forwarded unchanged to the underlying
            `DataCollatorForLanguageModeling` constructor.
        ignore_index (`int`, *optional*, defaults to `-100`):
            Value written into the label positions that are masked out.
    """

    def __init__(self, *args, mlm: bool = False, ignore_index: int = -100, **kwargs):
        super().__init__(*args, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index

        # Resolve the ids of the ChatML delimiters once, up front.
        encode = self.tokenizer.encode
        self.start_token = encode("<|im_start|>", add_special_tokens=False)[0]
        self.end_token = encode("<|im_end|>", add_special_tokens=False)[0]
        # The last id is taken for "\n" (the encoding may be preceded by other pieces).
        self.new_line_token = encode("\n", add_special_tokens=False)[-1]
        self.bos_token = self.tokenizer.bos_token_id

        # Echo the resolved ids so they can be checked by eye.
        for token_id in (self.start_token, self.end_token, self.new_line_token):
            print(token_id)

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate `examples` with the parent class, then mask headers and BOS in the labels.

        Args:
            examples: Items accepted by `DataCollatorForLanguageModeling.torch_call`.

        Returns:
            The batch produced by the parent collator, with `batch["labels"]` edited in place.
        """
        batch = super().torch_call(examples)
        labels = batch["labels"]

        for row_idx in range(len(examples)):
            inside_header = False
            # Snapshot the row first: each position is inspected before it may be overwritten.
            for pos, token in enumerate(labels[row_idx].tolist()):
                if token == self.start_token:
                    inside_header = True

                if inside_header or token == self.bos_token:
                    labels[row_idx][pos] = self.ignore_index

                # The newline closing the header is masked above, then masking stops.
                if token == self.new_line_token:
                    inside_header = False

        return batch

In [4]:
# Checkpoint id used for both the tokenizer and the model.
# Other id noted by the author: someone13574/Mistral-7B-v0.1-sharded
base_model = "ehartford/dolphin-2.2.1-mistral-7b"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# One ChatML-formatted string per chat session, stored under the "session" column.
chat_sessions = prepare_dataset("./data/messages.json")
dataset = Dataset.from_dict({"session": chat_sessions})

# Constructing the collator prints the token ids it resolved for
# "<|im_start|>", "<|im_end|>" and "\n".
data_collator = DataCollatorForLanguageModelingChatML(tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


32001
32000
13


In [5]:
# Show one formatted session to eyeball the ChatML layout.
sample_session = dataset[2500]["session"]
print(sample_session)

<|im_start|>Alexander Smirnov
>>> здарова нормик<|im_end|>
<|im_start|>Федук
>>> чо ты куда пропал то<|im_end|>
<|im_start|>Alexander Smirnov
>>> вечером сделаю бота
>>> сегодня еще дедлайн по домке<|im_end|>
<|im_start|>Федук
>>> ок, на один глаз залетишь на умник?<|im_end|>
<|im_start|>Alexander Smirnov
>>> на наш залечу<|im_end|>
<|im_start|>Федук
>>> кк<|im_end|>


In [6]:
# Tokenize a single session and pass it through the collator to inspect the masking.
# Row-first indexing (dataset[2500]["session"]) fetches one row instead of pulling
# the entire "session" column just to read one element.
sample_input_ids = tokenizer(dataset[2500]["session"], return_tensors="pt")["input_ids"][0]
collator_res = data_collator([sample_input_ids])

In [7]:
# Decode only the label positions that were not replaced by -100.
sample_labels = collator_res["labels"][0]
unmasked_labels = sample_labels[sample_labels != -100]
print(tokenizer.decode(unmasked_labels))

>>> здарова нормик<|im_end|> 
>>> чо ты куда пропал то<|im_end|> 
>>> вечером сделаю бота
>>> сегодня еще дедлайн по домке<|im_end|> 
>>> ок, на один глаз залетишь на умник?<|im_end|> 
>>> на наш залечу<|im_end|> 
>>> кк<|im_end|>


In [8]:
# Quantization settings: 4-bit loading with the "nf4" quant type, bfloat16 compute
# dtype, and double quantization turned off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# device_map={"": 0} — presumably places the whole model on device 0; confirm for multi-GPU hosts.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Silence the warnings during training. Please re-enable for inference!
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Start a Weights & Biases run for this training job.
wandb.init(
    project="Fine tuning mistral 7B",
    job_type="training",
    anonymous="allow",
)

[34m[1mwandb[0m: Currently logged in as: [33mfuriousteabag[0m. Use [1m`wandb login --relogin`[0m to force relogin


Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f3b6df0fb20>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f3b6d68e830, execution_count=9 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f3b6d68e6e0, raw_cell="wandb.init(project='Fine tuning mistral 7B', job_t.." store_history=True silent=False shell_futures=True cell_id=6f9a82af-72ab-4890-9c36-914abc5f2b8e> result=<wandb.sdk.wandb_run.Run object at 0x7f3b67c038e0>>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [10]:
# NOTE: this cell rebinds `model`, so re-running it wraps an already wrapped model;
# restart from the model-loading cell before executing it again.
model = prepare_model_for_kbit_training(model)

# Projection layers that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

model = get_peft_model(model, peft_config)

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f3b6df0fb20>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f3b67ce4eb0, raw_cell="model = prepare_model_for_kbit_training(model)
pef.." store_history=True silent=False shell_futures=True cell_id=9db352f3-3ddb-4678-b896-12a8d21dc3e2>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f3b6df0fb20>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f3b67ce48b0, execution_count=10 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f3b67ce4eb0, raw_cell="model = prepare_model_for_kbit_training(model)
pef.." store_history=True silent=False shell_futures=True cell_id=9db352f3-3ddb-4678-b896-12a8d21dc3e2> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [11]:
training_arguments = TrainingArguments(
    # Output, checkpointing and logging
    output_dir="./results",
    save_steps=5000,
    logging_steps=30,
    report_to="wandb",
    # Training length and batching: 2 sequences per device, 8 accumulation steps
    num_train_epochs=3,
    max_steps=-1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    group_by_length=True,
    # Optimizer
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): with lr_scheduler_type="constant" the warmup_ratio below appears
    # to have no effect (the constant schedule has no warmup phase). If warmup was
    # intended, "constant_with_warmup" is the matching scheduler — confirm.
    lr_scheduler_type="constant",
    warmup_ratio=0.3,
    # Mixed precision flags (both disabled)
    fp16=False,
    bf16=False,
)

# Setting sft parameters
# NOTE(review): `model` was already wrapped with get_peft_model in an earlier cell;
# peft_config is still passed here, as in the original — verify it is not applied twice.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="session",
    tokenizer=tokenizer,
    data_collator=data_collator,
    peft_config=peft_config,
    max_seq_length=None,
    packing=False,
)

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f3b6df0fb20>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f3b67ce45b0, raw_cell="training_arguments = TrainingArguments(
    output.." store_history=True silent=False shell_futures=True cell_id=e4da5949-7786-4c11-b679-9a6305cfc56e>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given



Map:   0%|          | 0/7982 [00:00<?, ? examples/s]

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f3b6df0fb20>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f3b67ce45e0, execution_count=11 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f3b67ce45b0, raw_cell="training_arguments = TrainingArguments(
    output.." store_history=True silent=False shell_futures=True cell_id=e4da5949-7786-4c11-b679-9a6305cfc56e> result=None>,),kwargs {}:




TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [None]:
# Run the fine-tuning loop.
trainer.train()

# Save the fine-tuned model
finetuned_model_dir = "finetuned-7b-telegram"
trainer.model.save_pretrained(finetuned_model_dir)

# Close the W&B run opened earlier in the notebook.
wandb.finish()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f3b6df0fb20>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f3b67ce4bb0, raw_cell="trainer.train()
# Save the fine-tuned model
traine.." store_history=True silent=False shell_futures=True cell_id=e3c8ccaa-fa36-4977-95e1-55b63fe49094>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
