In [1]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import torch
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import pandas as pd
from scripts.data_builder import parse_whatsapp_chat
from scripts.utils import load_model, chat_loop

class CustomTokenizer:
    def __init__(self, tokenizer, text_column, max_length):
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def tokenize(self, examples):
        return self.tokenizer(examples[self.text_column], padding="max_length", truncation=True, max_length=self.max_length)

    def format_instruction(self, example):
        return {
            "text": f"""### Instrução:
            {example['instruction']}

            ### Entrada:
            {example['input']}

            ### Resposta:
            {example['output']}"""
                }

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

INITIALIZE MODEL AND TOKENIZER

In [4]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))
tok = CustomTokenizer(tokenizer, "text", 256)

BUILD DATA FROM RAW WHATSAPP CHAT

In [10]:
parse_whatsapp_chat(
    input_path="data/whatsapp.txt",
    output_path="data/dataset.jsonl",
    main_character="Enzo Bustamante"
)

dataset = load_dataset("json", data_files={"train": "data/dataset.jsonl"})
dataset = dataset.map(tok.format_instruction)
tokenized_dataset = dataset.map(tok.tokenize, batched=True)

✅ Saved 20721 samples to data/dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20721 [00:00<?, ? examples/s]

Map:   0%|          | 0/20721 [00:00<?, ? examples/s]

In [11]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="./character-model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    warmup_steps=10,
    logging_dir='./logs',
    num_train_epochs=1,
    max_steps=5000,
    save_strategy="epoch",
    fp16=True,
    gradient_checkpointing=True
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TRAIN

In [65]:
trainer.train()

Step,Training Loss
500,0.9394
1000,0.927
1500,0.925
2000,0.9143
2500,0.9317
3000,0.9184
3500,0.8992
4000,0.9129
4500,0.9237
5000,0.8907


TrainOutput(global_step=5000, training_loss=0.918228515625, metrics={'train_runtime': 6020.1107, 'train_samples_per_second': 1.661, 'train_steps_per_second': 0.831, 'total_flos': 1.0927208398848e+17, 'train_loss': 0.918228515625, 'epoch': 0.4826021910139472})

SAVE THE NEW MODEL

In [4]:
model.save_pretrained("models/model_2")
tokenizer.save_pretrained("models/model_2")

('models/model_2/tokenizer_config.json',
 'models/model_2/special_tokens_map.json',
 'models/model_2/tokenizer.json')

OR LOAD IF ALREADY TRAINED

In [2]:
#model, tokenizer = load_model("./model_2")

Loading configuration...
Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading LoRA adapters...


CHAT WITH MODEL

In [None]:
chat_loop(model, tokenizer, "Enzo Bustamante", "Let 💛")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Enzo Bustamante: Oi amorzao
Enzo Bustamante: Tô bem
Enzo Bustamante: Te amooo
Enzo Bustamante: Como ta ai?
Enzo Bustamante: <Mídia oculta>
Enzo Bustamante: Comprando tudo pra o barbecue de hoje
Enzo Bustamante: Meu pai nao conseguiu comprar a car

Conversa encerrada. Até mais!
