## **Preparação do Ambiente**

In [None]:
# Mount Google Drive under /content/drive; the project paths used by the
# following cells all live below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Instala as bibliotecas necessárias para realizar o fine-tuning de um modelo de linguagem com a biblioteca Unsloth.

In [None]:
%%capture
# NOTE: %%capture must stay on the first line of the cell; it hides the pip output.
# Extract the leading "digits and dots" part of the installed torch version
# (e.g. "2.8.0" out of "2.8.0+cu126") to choose a matching xformers build.
import re
import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)

# torch 2.8.0 gets xformers 0.0.32.post2; any other torch version gets 0.0.29.post3.
xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
# Statement order matters below: trl and bitsandbytes are installed here first
# and then installed again further down (trl pinned, bitsandbytes upgraded).
# --no-deps installs the named packages without resolving their dependencies.
!pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth
# Pinned versions of transformers and trl.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
!pip install -U bitsandbytes

## **Preparação do Dataset**

Faz a importação e o tratamento do dataset, salvando o resultado no arquivo 'filtered_dataset.json'.

In [None]:
import os
import pandas as pd
from datasets import load_dataset

project_path = "/content/drive/MyDrive/Colab_Notebooks/TechChallange_Fase3"
dataset_path = os.path.join(project_path, 'trn.json')

# Columns that are exported for fine-tuning, and the cap on exported records.
required_columns = ['title', 'content']
max_records = 100000

try:
    # Load the JSON-lines file
    dataset = load_dataset('json', data_files=dataset_path, split='train')

    # Convert to a DataFrame
    df = pd.DataFrame(dataset)

    # Join list-valued 'content' entries into a single string
    df['content'] = df['content'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

    # Drop rows that repeat a 'title'
    df_cleaned = df.drop_duplicates(subset=['title'])

    # Drop rows with a missing title or content. Only the exported columns are
    # checked, so a NaN in a column that is discarded anyway does not cost a row.
    df_cleaned = df_cleaned.dropna(subset=required_columns)

    # Keep rows whose title AND content contain something other than whitespace;
    # a blank title cannot be turned into a training prompt later on.
    has_content = df_cleaned['content'].str.strip() != ''
    has_title = df_cleaned['title'].str.strip() != ''
    df_filtered = df_cleaned[has_content & has_title].copy()

    # Keep only the exported columns
    df_final = df_filtered[required_columns].copy()

    # Cap the number of exported records
    df_final = df_final.head(max_records)

    print("Dataset carregado, filtrado e pronto para formatação.")
    print(f"Número de registros originais: {len(df)}")
    print(f"Número de registros após filtragem: {len(df_final)}")
    print("Colunas do dataset final:", df_final.columns.tolist())

    # Output path for the filtered JSON-lines file
    json_output_path = os.path.join(project_path, 'filtered_dataset.json')

    # Write one JSON record per line
    df_final.to_json(json_output_path, orient='records', lines=True)

    print(f"Dataset salvo com sucesso em: {json_output_path}")

except FileNotFoundError:
    print(f"Erro: Dataset não encontrado em {dataset_path}")
    # Re-raise so "Run All" stops here instead of continuing with a missing
    # (or stale) filtered_dataset.json.
    raise
except Exception as e:
    print(f"Ocorreu um erro: {e}")
    # Re-raise to keep the traceback and halt the notebook.
    raise

Generating train split: 0 examples [00:00, ? examples/s]

Dataset carregado, filtrado e pronto para formatação.
Número de registros originais: 2248619
Número de registros após filtragem: 100000
Colunas do dataset final: ['title', 'content']
Dataset salvo com sucesso em: /content/drive/MyDrive/Colab_Notebooks/TechChallange_Fase3/filtered_dataset.json


## **Chamada do Foundation Model**

Carrega o modelo de linguagem pré-treinado (foundation model) `unsloth/Phi-3.5-mini-instruct`, com quantização em 4 bits.

In [None]:
from unsloth import FastLanguageModel, get_chat_template # Import get_chat_template
import torch
max_seq_length = 1024
dtype = None # None for auto detection
load_in_4bit = True # 4bit quantization to reduce memory usage

# Reference list of model ids; it is informational only and not read below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct", # This is the one used in this notebook
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Use the Phi-3 chat format for the Phi-3.5 model loaded above. Phi-3.5 marks
# turns with <|user|> / <|assistant|> / <|end|>; Llama-3 style markers such as
# <|start_header_id|> and <|eot_id|> are not part of its chat format, so
# training on them would teach a prompt layout the model is never queried with.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3",
)

# Keep the `chat_template` name bound to the template that is actually in use.
chat_template = tokenizer.chat_template


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Configura o modelo para o fine-tuning com PEFT, adicionando adaptadores LoRA.

In [None]:
# LoRA settings, gathered in one place so they can be inspected or tweaked
# before the adapter is attached.
lora_settings = dict(
    r = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Wrap the loaded model with the adapter described above.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.9.11 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## **Execução do Fine-Tuning**

Prepara o dataset para o fine-tuning, formatando o prompt usado no treinamento. O dataset resultante é colocado na variável *training_dataset*.

In [None]:
import json
import os
import random
import pandas as pd
from datasets import load_dataset, Dataset

def create_instruction_data(input_file, max_chars=1500, seed=None):
    """
    Build instruction-style training examples from a JSON-lines file.

    Each usable record yields a dict with the keys "instruction" (a randomly
    chosen prompt template filled with the record's title), "input" (always
    an empty string) and "output" (the record's content, truncated to
    ``max_chars`` characters plus "..." when longer).

    Blank lines, lines that are not JSON objects, and records whose title or
    content is missing, not a string, or only whitespace are skipped, so a
    single bad record no longer discards the whole dataset.

    Args:
        input_file: Path to a UTF-8 JSON-lines file with "title" and
            "content" fields.
        max_chars: Maximum number of content characters kept per example.
        seed: Optional seed for the template choice. With ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A list of example dicts, or ``None`` when the file is missing or
        another error occurs (the reason is printed).
    """
    # Prompt templates used to add variety to the instructions.
    summary_instruction_templates = [
        "Give me a summary of the book '{}'.",
        "Please summarize the book titled '{}'.",
        "What is the book '{}' about? Provide a summary.",
        "Can you provide a synopsis for the book '{}'?",
        "I need a short summary of '{}'.",
    ]

    # A private generator makes seeded runs reproducible without touching the
    # global random state; unseeded calls keep using the global generator.
    chooser = random.Random(seed) if seed is not None else random

    try:
        formatted_data = []

        # Stream the file line by line instead of loading every record first.
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue

                book = json.loads(line)
                if not isinstance(book, dict):
                    continue

                title = book.get("title")
                content = book.get("content")
                # JSON null (or any non-string value) cannot be stripped.
                if not isinstance(title, str) or not isinstance(content, str):
                    continue

                title = title.strip()
                content = content.strip()
                if not title or not content:
                    continue

                # Truncate long content to help stay within max_seq_length.
                if len(content) > max_chars:
                    content = content[:max_chars] + "..."

                instruction = chooser.choice(summary_instruction_templates).format(title)

                formatted_data.append({
                    "instruction": instruction,
                    "input": "",
                    "output": content
                })

        return formatted_data

    except FileNotFoundError:
        print(f"Erro: Dataset não encontrado em {input_file}")
        return None
    except Exception as e:
        print(f"Ocorreu um erro: {e}")
        return None

def formatting_prompts_func(examples):
    """
    Render a batch of instruction/output pairs as chat-formatted training text.

    Reads the "instruction" and "output" columns of ``examples`` and, for each
    pair, builds a two-message conversation (user, then assistant) that is
    rendered to a string with the notebook-level tokenizer's chat template.

    Returns a dict with a single "text" key holding the rendered strings.
    """
    def render(instruction, output):
        # One user turn followed by the assistant's answer.
        conversation = [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": output},
        ]
        # Return plain text (no token ids) and no trailing generation prompt.
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )

    pairs = zip(examples["instruction"], examples["output"])
    return {"text": [render(instruction, output) for instruction, output in pairs]}

# --- Main execution ---
project_path = "/content/drive/MyDrive/Colab_Notebooks/TechChallange_Fase3"
filtered_dataset_path = os.path.join(project_path, 'filtered_dataset.json')

# Convert the filtered records into instruction examples
formatted_data_list = create_instruction_data(filtered_dataset_path)

# Stop here when nothing usable was produced (None signals a load error, an
# empty list means no valid record). Otherwise `training_dataset` would stay
# undefined, or keep a value from an earlier run, and the training cell would
# fail later or train on stale data.
if not formatted_data_list:
    raise RuntimeError(
        f"Não foi possível gerar exemplos de treinamento a partir de {filtered_dataset_path}"
    )

# Build the dataset from the list of dicts
raw_dataset = Dataset.from_list(formatted_data_list)
print("Dataset de dados brutos gerado com sucesso!")
print(f"Número total de exemplos: {len(raw_dataset)}")

# Add the chat-formatted "text" column used for training
training_dataset = raw_dataset.map(formatting_prompts_func, batched=True, num_proc=os.cpu_count())

print("\nExemplo do primeiro item do dataset formatado:")
print(training_dataset[0]["text"])

Dataset de dados brutos gerado com sucesso!
Número total de exemplos: 100000


Map (num_proc=12):   0%|          | 0/100000 [00:00<?, ? examples/s]


Exemplo do primeiro item do dataset formatado:
<|start_header_id|>user<|end_header_id|>

Give me a summary of the book 'Girls Ballet Tutu Neon Pink'.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

High quality 3 layer ballet tutu. 12 inches in length<|eot_id|>


Realiza o treinamento do modelo base (fine-tuning).

In [None]:
from trl import SFTConfig, SFTTrainer
from unsloth import FastLanguageModel
from trl import SFTConfig, SFTTrainer

output_path = "/content/drive/MyDrive/Colab_Notebooks/TechChallange_Fase3/model"
max_seq_length = 1024

# Supervised fine-tuning on the chat-formatted "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = training_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,  # 2 x 8 = 16 examples per optimizer step
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        logging_steps = 100,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = output_path,
        report_to = "none",
    ),
)

# Start training the model
train_result = trainer.train()

# Save the trained adapter and the tokenizer at the root of output_path.
# Checkpoints written during training go into checkpoint-* subfolders, while
# the inference cell loads the model from output_path itself.
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

# Show the training summary as the cell output.
train_result

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/100000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 6,250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 29,884,416 of 3,850,963,968 (0.78% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
100,1.8722
200,1.7597
300,1.7367
400,1.7298
500,1.7164
600,1.7123
700,1.7222
800,1.6966
900,1.7085
1000,1.7011


Step,Training Loss
100,1.8722
200,1.7597
300,1.7367
400,1.7298
500,1.7164
600,1.7123
700,1.7222
800,1.6966
900,1.7085
1000,1.7011


## **Geração de Respostas**

Faz o teste de inferência utilizando o modelo base (sem fine-tuning), para comparação de resultados.

In [None]:
from transformers import TextStreamer
from unsloth import FastLanguageModel
import torch

# Load the base model and its tokenizer (no fine-tuning applied)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
    device_map = "auto",
)

# Switch the model to inference mode, exactly as the fine-tuned inference cell
# does, so both sides of the comparison generate under the same settings.
FastLanguageModel.for_inference(model)

# Prompt used for the comparison
inference_prompt = "Give me a summary of the book 'The New Comics Anthology'."
messages = [
    {"role": "user", "content": inference_prompt},
]

# Render the conversation with the tokenizer's chat template and append the
# assistant generation prompt
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt"
).to("cuda")

# Stream generated tokens to the output as they are produced, without
# echoing the prompt
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

print("\n\n--- Resposta Final (sem fine-tuning): ---\n")

# Generate the answer (at most 100 new tokens)
_ = model.generate(
    inputs,
    streamer=text_streamer,
    max_new_tokens = 100,
    use_cache = True
)

==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


--- Resposta Final (sem fine-tuning): ---

"The New Comics Anthology" is a fictional title, and as such, I cannot provide a summary of an actual book. However, if this were a real anthology, it would likely be a collection of various comic stories, each with its own unique narrative and artistic style.

In a typical anthology of this nature, you might find a diverse range of themes, from superhero tales to more grounded, slice-of-life stories. Each


Faz o teste de inferência utilizando o modelo com fine-tuning.

In [None]:
from transformers import TextStreamer, BitsAndBytesConfig
from unsloth import FastLanguageModel
import torch

max_seq_length = 1024
dtype = None
# NOTE(review): load_in_4bit is defined but not passed to the loader below;
# 4-bit loading is requested through quantization_config instead.
load_in_4bit = True

# Location of the fine-tuned model on Google Drive
model_path = "/content/drive/MyDrive/Colab_Notebooks/TechChallange_Fase3/model"

# 4-bit quantization settings handed to the loader
quantization_config = BitsAndBytesConfig(load_in_4bit = True)

# Load the fine-tuned model together with its tokenizer
load_kwargs = dict(
    model_name = model_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    quantization_config = quantization_config,
    device_map = "auto",
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Switch the model to inference mode
FastLanguageModel.for_inference(model)

# Single-turn conversation holding the test prompt
inference_prompt = "Give me a summary of the book 'The New Comics Anthology'."
messages = [{"role": "user", "content": inference_prompt}]

# Render the conversation with the chat template (including the assistant
# generation prompt), then move the resulting tensor to the GPU
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt"
)
inputs = inputs.to("cuda")

# Streams generated tokens to the output without echoing the prompt
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

print("\n\n--- Resposta Final (com fine-tuning): ---\n")

# Generate the answer (at most 100 new tokens)
generation_kwargs = dict(
    streamer = text_streamer,
    max_new_tokens = 100,
    use_cache = True,
)
_ = model.generate(inputs, **generation_kwargs)

==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


--- Resposta Final (com fine-tuning): ---

"The New Comics Anthology is a must-have for any comics fan. It's a great collection of the best of the best, and it's a great way to get a taste of the best of the new." --Jim Lee, creator of the X-Men, Avengers, and other comics"The New Comics Anthology is a great collection of the best of the best. It's a great way to get a taste of the best of the
