In [1]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import torch
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import pandas as pd
from scripts.data_builder import parse_whatsapp_chat
from scripts.utils import load_model, Chat

class CustomTokenizer:
    def __init__(self, tokenizer, text_column, max_length):
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.max_length = max_length

    def tokenize(self, examples):
        return self.tokenizer(examples[self.text_column], padding="max_length", truncation=True, max_length=self.max_length)

    def format_instruction(self, example):
        return {
            "text": f"""### Instrução:
            {example['instruction']}

            ### Entrada:
            {example['input']}

            ### Resposta:
            {example['output']}"""
                }

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

INITIALIZE MODEL AND TOKENIZER

In [6]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))
tok = CustomTokenizer(tokenizer, "text", 256)

BUILD DATA FROM RAW WHATSAPP CHAT

In [10]:
parse_whatsapp_chat(
    input_path="data/whatsapp.txt",
    output_path="data/dataset.jsonl",
    main_character="Enzo Bustamante"
)

dataset = load_dataset("json", data_files={"train": "data/dataset.jsonl"})
dataset = dataset.map(tok.format_instruction)
tokenized_dataset = dataset.map(tok.tokenize, batched=True)

✅ Saved 20721 samples to data/dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20721 [00:00<?, ? examples/s]

Map:   0%|          | 0/20721 [00:00<?, ? examples/s]

In [11]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="./character-model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    warmup_steps=10,
    logging_dir='./logs',
    num_train_epochs=2,
    #max_steps=10000,
    save_strategy="epoch",
    fp16=True,
    gradient_checkpointing=True
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TRAIN

In [12]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
500,1.0948
1000,0.9612
1500,0.9347
2000,0.923
2500,0.9384
3000,0.9243
3500,0.9047
4000,0.9176
4500,0.9253
5000,0.8911


TrainOutput(global_step=20720, training_loss=0.8809759383035903, metrics={'train_runtime': 24670.1118, 'train_samples_per_second': 1.68, 'train_steps_per_second': 0.84, 'total_flos': 4.528125888398623e+17, 'train_loss': 0.8809759383035903, 'epoch': 1.9998552193426957})

SAVE THE NEW MODEL

In [13]:
model.save_pretrained("models/model_2")
tokenizer.save_pretrained("models/model_2")

('models/model_2/tokenizer_config.json',
 'models/model_2/special_tokens_map.json',
 'models/model_2/tokenizer.model',
 'models/model_2/added_tokens.json',
 'models/model_2/tokenizer.json')

OR LOAD IF ALREADY TRAINED

In [2]:
model, tokenizer = load_model("models/model_2")

Loading configuration...
Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading LoRA adapters...


CHAT WITH MODEL

In [7]:
chat = Chat(model, tokenizer, "Enzo Bustamante", "Let 💛", max_new_tokens=100, top_k=30, top_p=0.85, temperature=0.5, repetition_penalty=1.2)
chat.chat_loop()

Enzo Bustamante: Vamos sim amorzao
Enzo Bustamante: Acho que umas 7 e pouco eu saio do escritorio
Enzo Bustamante: E ai? Oq tem pra comer no ap?
Enzo Bustamante: Pode ser?
Enzo Bustamante: <Mídia oculta>
Enzo Bustamante: Ta bom o pao de queijo kkkk
En

Conversation ended. Goodbye!
