In [7]:
!pip install -q transformers datasets sentence-transformers accelerate scikit-learn

In [8]:
import torch
from datasets import load_dataset
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
)
from sentence_transformers import SentenceTransformer
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)
import numpy as np
import random
from tqdm.auto import tqdm

# Reprodutibilidade básica
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

In [9]:
# ============================
# 3.1 Carregar dataset IMDb
# ============================
dataset = load_dataset("stanfordnlp/imdb")

# Para caber bem no Colab, vamos usar subconjuntos menores
N_TRAIN = 2000
N_TEST = 200  # pode aumentar depois se estiver rápido

train_data = dataset["train"].shuffle(seed=SEED).select(range(N_TRAIN))
test_data  = dataset["test"].shuffle(seed=SEED).select(range(N_TEST))

label_map = {0: "negative", 1: "positive"}

print("Exemplo de dado de treino:")
print(train_data[0])
print("Tamanhos -> train:", len(train_data), "| test:", len(test_data))

# ============================
# 3.2 Baseline: Zero-Shot com BART-MNLI
# ============================
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if device == "cuda" else -1
)

candidate_labels = ["positive", "negative"]

def predict_zero_shot(text: str) -> str:
    result = zero_shot_classifier(
        text,
        candidate_labels=candidate_labels,
        multi_label=False
    )
    # Pega o label com maior score
    return result["labels"][0]

y_true = []
y_pred_zero_shot = []

for sample in tqdm(test_data, desc="Baseline Zero-Shot (BART-MNLI)"):
    text = sample["text"]
    true_label = label_map[sample["label"]]
    pred_label = predict_zero_shot(text)
    y_true.append(true_label)
    y_pred_zero_shot.append(pred_label)

print("=== Métricas Baseline: BART Zero-Shot ===")
print(classification_report(y_true, y_pred_zero_shot, digits=4))
print("Matriz de confusão:")
print(confusion_matrix(y_true, y_pred_zero_shot))


Exemplo de dado de treino:
{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}
Tamanhos -> train: 2000 | test: 200


Device set to use cuda:0


Baseline Zero-Shot (BART-MNLI):   0%|          | 0/200 [00:00<?, ?it/s]

=== Métricas Baseline: BART Zero-Shot ===
              precision    recall  f1-score   support

    negative     0.8785    0.9038    0.8910       104
    positive     0.8925    0.8646    0.8783        96

    accuracy                         0.8850       200
   macro avg     0.8855    0.8842    0.8847       200
weighted avg     0.8852    0.8850    0.8849       200

Matriz de confusão:
[[94 10]
 [13 83]]


In [10]:
# ============================
# 4. RAG: Retriever com MiniLM
# ============================

# Carrega modelo de embeddings
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# Usamos parte do train_data como base de exemplos rotulados
N_BASE = 1000  # pode aumentar se o Colab aguentar
base_examples = train_data.select(range(N_BASE))

base_texts  = [ex["text"] for ex in base_examples]
base_labels = [label_map[ex["label"]] for ex in base_examples]

print("Gerando embeddings da base de exemplos...")
base_embeddings = embedder.encode(
    base_texts,
    convert_to_tensor=True,
    batch_size=64,
    show_progress_bar=True
)

base_embeddings.shape


Gerando embeddings da base de exemplos...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

torch.Size([1000, 384])

In [11]:
# ============================
# 5. LLM TinyLlama e funções de RAG
# ============================

llm_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
# Alguns modelos não têm pad_token definido
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        llm_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        llm_model_name,
        torch_dtype=torch.float32,
    )
    model.to(device)


def retrieve_examples(query_text: str, k: int = 3):
    """
    Recupera k exemplos mais similares da base usando produto interno.
    """
    query_emb = embedder.encode(query_text, convert_to_tensor=True)
    scores = torch.matmul(base_embeddings, query_emb)
    top_k = torch.topk(scores, k=k)
    indices = top_k.indices.cpu().numpy().tolist()

    retrieved = []
    for idx in indices:
        retrieved.append(
            {
                "text": base_texts[idx],
                "label": base_labels[idx],
            }
        )
    return retrieved


def build_prompt(review_text: str, examples, language: str = "en") -> str:
    """
    Monta um prompt few-shot com exemplos rotulados.
    """
    few_shot_blocks = []
    for i, ex in enumerate(examples, start=1):
        few_shot_blocks.append(
            f"Example {i}:\nReview: {ex['text']}\nLabel: {ex['label']}\n"
        )
    few_shot_text = "\n".join(few_shot_blocks)

    system_instruction = (
        "You are a sentiment classifier. "
        "You must classify movie reviews as exactly one label: positive or negative.\n"
    )

    prompt = (
        system_instruction
        + "\nHere are some labeled examples:\n"
        + few_shot_text
        + "\nNow classify the sentiment of the following review as exactly one word: positive or negative.\n"
        + f"Review:\n{review_text}\n\nAnswer only with: positive or negative.\nLabel:"
    )
    return prompt


def generate_label_with_llm(prompt: str, max_new_tokens: int = 5) -> str:
    """
    Chama o TinyLlama e extrai 'positive' ou 'negative' do output.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Pega só o que vem depois de "Label:"
    if "Label:" in full_text:
        answer = full_text.split("Label:")[-1].strip()
    else:
        answer = full_text.strip()

    answer = answer.lower()
    if "positive" in answer:
        return "positive"
    if "negative" in answer:
        return "negative"

    # fallback
    return "positive"


def predict_rag(review_text: str, k: int = 3) -> str:
    """
    Pipeline RAG completo para um review:
    - recupera exemplos
    - monta prompt
    - pede pro LLM o rótulo
    """
    examples = retrieve_examples(review_text, k=k)
    prompt = build_prompt(review_text, examples)
    label = generate_label_with_llm(prompt)
    return label


In [12]:
# ============================
# 6. Avaliação do RAG (TinyLlama + MiniLM)
# ============================

y_pred_rag = []

for sample in tqdm(test_data, desc="RAG TinyLlama + MiniLM"):
    text = sample["text"]
    true_label = label_map[sample["label"]]  # mesmo mapeamento usado no baseline
    pred_label = predict_rag(text, k=3)
    y_pred_rag.append(pred_label)

print("=== Métricas RAG: TinyLlama + Retrieval ===")
print(classification_report(y_true, y_pred_rag, digits=4))
print("Matriz de confusão:")
print(confusion_matrix(y_true, y_pred_rag))


RAG TinyLlama + MiniLM:   0%|          | 0/200 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2183 > 2048). Running this sequence through the model will result in indexing errors


=== Métricas RAG: TinyLlama + Retrieval ===
              precision    recall  f1-score   support

    negative     0.0000    0.0000    0.0000       104
    positive     0.4800    1.0000    0.6486        96

    accuracy                         0.4800       200
   macro avg     0.2400    0.5000    0.3243       200
weighted avg     0.2304    0.4800    0.3114       200

Matriz de confusão:
[[  0 104]
 [  0  96]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
