In [13]:
%pip install transformers datasets scikit-learn --upgrade 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# 🎯 Classificação de Intenção de Chamadas com LLaMA 3 + BERT

Este notebook executa:

1. Carregamento de transcrições de chamadas (JSON)
2. Rotulagem automática com `llama3.2:3b` (via Ollama)
3. Criação de dataset para fine-tuning
4. Treinamento de modelo BERT com Hugging Face
5. Avaliação e exportação do modelo

## 🔁 1. Agregar transcrições

In [14]:
import os
import json
from tqdm import tqdm

TRANSCRIPTS_DIR = "transcripts"
dataset = []

for file in tqdm(os.listdir(TRANSCRIPTS_DIR)):
    if not file.endswith(".json"):
        continue
    with open(os.path.join(TRANSCRIPTS_DIR, file), "r", encoding="utf-8") as f:
        segments = json.load(f)
        full_text = " ".join([seg["text"] for seg in segments if seg["text"].strip()])
        if len(full_text) < 10:
            continue
        dataset.append({"filename": file, "text": full_text})

100%|██████████| 20/20 [00:00<00:00, 3495.84it/s]


## 🤖 2. Classificar intenção com LLaMA 3.2

In [15]:
import ollama

INTENT_LABELS = [
    "cancelamento",
    "reclamação",
    "segunda via",
    "informação técnica",
    "troca de titularidade",
    "elogio",
    "outros",
]

SYSTEM_PROMPT = f"""
Você é um assistente especialista em classificar intenções de chamadas de clientes de empresas de energia.

Dada a transcrição de uma chamada, classifique qual é a intenção principal da pessoa cliente.

Escolha uma das seguintes intenções:
{", ".join(INTENT_LABELS)}

Responda apenas com a intenção. Nada mais.
"""


def classify_intent(text):
    response = ollama.chat(
        model="llama3.2:3b",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ],
    )
    return response["message"]["content"].strip().lower()


for entry in tqdm(dataset):
    entry["intent"] = classify_intent(entry["text"])

100%|██████████| 20/20 [00:11<00:00,  1.78it/s]


## 💾 3. Salvar dataset rotulado

In [16]:
import pandas as pd

df = pd.DataFrame(dataset)
df.to_csv("dataset/intent_dataset.csv", index=False)
df.head()

Unnamed: 0,filename,text,intent
0,b8cceb60-1be2-4e0d-b8d0-98eb4ccffc92_audio2.json,Olá! Você ligou para Equatorial Pará. Para inf...,religação
1,8941bfd6-3a4c-4cf4-a3d2-2f875586bc2d_audio2.json,Olá! Você ligou para Equatorial Pará. Para inf...,informação técnica
2,da6e2d9c-d629-4dd8-b8a6-c28696c6c506_audio2.json,Olá! Você ligou para Equatorial Maranhão. Para...,informação técnica
3,50596932-b962-4c8b-aca4-7b00d5694d4e_audio1.json,Boa noite meu amigo Essa é a falta de energia ...,reclamação
4,125eb62d-8ea5-41eb-9410-e63d644285a1_audio2.json,Olá! Você ligou para Equatorial Maranhão. Para...,informação técnica


In [17]:
intent_counts = df["intent"].value_counts()
print("Distribuição de Intenções:")
print(intent_counts)

Distribuição de Intenções:
intent
informação técnica    7
reclamação            7
outros                2
cancelamento          2
religação             1
elogio                1
Name: count, dtype: int64


## 🧠 4. Fine-tuning de BERT

In [18]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from sklearn.metrics import classification_report

MODEL_NAME = "neuralmind/bert-base-portuguese-cased"

# Carregar CSV e preparar labels
df = pd.read_csv("dataset/intent_dataset.csv")
LABELS = sorted(df["intent"].unique())
label2id = {label: i for i, label in enumerate(LABELS)}
id2label = {v: k for k, v in label2id.items()}
df["label"] = df["intent"].map(label2id)

# Dataset HF
dataset = Dataset.from_pandas(df[["text", "label"]])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

## 🔠 5. Tokenização e collator

In [19]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=512)


tokenized_dataset = dataset.map(tokenize, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

## ⚙️ 6. Configurar modelo e treino

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(LABELS), id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./intent_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    logging_steps=1,
    weight_decay=0.01,
    save_strategy="no",
)

def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 🚀 7. Treinar modelo

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
1,1.8144
2,1.5905
3,1.6625
4,1.3799
5,1.4232
6,1.3556
7,1.0818
8,1.5691
9,0.9717
10,1.528


TrainOutput(global_step=20, training_loss=1.2553155690431594, metrics={'train_runtime': 14.8927, 'train_samples_per_second': 10.744, 'train_steps_per_second': 1.343, 'total_flos': 40298549818176.0, 'train_loss': 1.2553155690431594, 'epoch': 10.0})

## 📈 8. Avaliação

In [22]:
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=-1)

# print(classification_report(y_true, y_pred, target_names=LABELS))

from sklearn.metrics import classification_report

# Descobrir quais rótulos realmente estão presentes
present_labels = sorted(set(y_true) | set(y_pred))

# Criar lista de nomes correspondente
present_names = [LABELS[i] for i in present_labels]

# Relatório
print(
    classification_report(
        y_true,
        y_pred,
        labels=present_labels,
        target_names=present_names,
        zero_division=0,
    )
)

                    precision    recall  f1-score   support

      cancelamento       0.00      0.00      0.00         1
informação técnica       0.00      0.00      0.00         2
        reclamação       0.25      1.00      0.40         1

          accuracy                           0.25         4
         macro avg       0.08      0.33      0.13         4
      weighted avg       0.06      0.25      0.10         4



In [23]:
report_dict = classification_report(
    y_true,
    y_pred,
    labels=present_labels,
    target_names=present_names,
    output_dict=True,
    zero_division=0,
)

import pandas as pd

pd.DataFrame(report_dict).transpose()

Unnamed: 0,precision,recall,f1-score,support
cancelamento,0.0,0.0,0.0,1.0
informação técnica,0.0,0.0,0.0,2.0
reclamação,0.25,1.0,0.4,1.0
accuracy,0.25,0.25,0.25,0.25
macro avg,0.083333,0.333333,0.133333,4.0
weighted avg,0.0625,0.25,0.1,4.0


## 💾 9. Salvar modelo

In [24]:
trainer.save_model("fine_tuned_intent_bert")
tokenizer.save_pretrained("fine_tuned_intent_bert")

('fine_tuned_intent_bert/tokenizer_config.json',
 'fine_tuned_intent_bert/special_tokens_map.json',
 'fine_tuned_intent_bert/vocab.txt',
 'fine_tuned_intent_bert/added_tokens.json',
 'fine_tuned_intent_bert/tokenizer.json')