In [None]:
# Colab setup: imports first, then Drive mount, dataset paths and run settings.
import os, random
import numpy as np
import torch
from google.colab import drive
from transformers import set_seed

# (Re)install only when a package is missing in the current session
# %pip -q install "transformers==4.41.2" "datasets==2.20.0" "accelerate==0.32.0" "peft==0.11.1" "sentencepiece==0.1.99" "rouge-score==0.1.2"

# Mount Google Drive; force_remount=True requests a fresh mount on every run.
drive.mount('/content/drive', force_remount=True)

# Project folder on Drive and the gzipped JSON train/test files inside it.
BASE_DIR = "/content/drive/MyDrive/tech-challenge-3"
TRAIN_PATH = BASE_DIR + "/trn.json.gz"
TEST_PATH = BASE_DIR + "/tst.json.gz"
print("Train:", TRAIN_PATH)
print("Test :", TEST_PATH)

# Environment: set the tokenizers parallelism flag, fix the seed, pick the device.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
set_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
from datasets import load_dataset

def clean(x):
    """Return ``x`` without surrounding whitespace; any falsy value (None, "") becomes ""."""
    if not x:
        return ""
    return x.strip()

# Load the train/test JSON files from Drive as a dataset dict with "train" and "test" splits.
raw = load_dataset(
    path="json",
    data_files=dict(train=TRAIN_PATH, test=TEST_PATH),
)

def build_fields_en(example):
    """Add the English ``prompt`` and the ``target`` text to one dataset record.

    Args:
        example: mapping that may carry ``title`` and ``content`` entries;
            missing or None values are treated as empty strings via ``clean``.

    Returns:
        The same ``example`` object, updated in place with ``prompt`` (the
        instruction plus the cleaned title) and ``target`` (the cleaned content).
    """
    title_text = clean(example.get("title", ""))
    content_text = clean(example.get("content", ""))
    prompt_lines = (
        "Task: Generate the product CONTENT from its TITLE.",
        f"TITLE: {title_text}",
        "CONTENT:",
    )
    example["prompt"] = "\n".join(prompt_lines)
    example["target"] = content_text
    return example

# Add prompt/target fields, then drop records whose target is empty after cleaning.
raw = raw.map(build_fields_en).filter(lambda ex: clean(ex["target"]) != "")

# Sample sizes for training and evaluation
train_sample_size = 120_000
test_sample_size = 4_000


def _shuffled_head(split, limit):
    """Shuffle ``split`` with seed 42 and keep at most ``limit`` rows."""
    keep = min(limit, len(split))
    return split.shuffle(seed=42).select(range(keep))


train_ds = _shuffled_head(raw["train"], train_sample_size)
eval_ds = _shuffled_head(raw["test"], test_sample_size)
len(train_ds), len(eval_ds)


In [None]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

# Checkpoint used throughout the notebook, and its tokenizer.
BASE_MODEL = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Maximum token lengths used (with truncation) for prompts and targets.
MAX_INPUT = 256
MAX_TARGET = 256
def tokenize(batch):
    """Tokenize a batch of prompts and targets for seq2seq training.

    Args:
        batch: mapping with ``prompt`` and ``target`` entries, as produced by a
            batched ``map`` over the prepared dataset.

    Returns:
        The tokenizer output for the prompts (truncated to ``MAX_INPUT``) with an
        extra ``labels`` entry holding the target ``input_ids`` (truncated to
        ``MAX_TARGET``).
    """
    encoded = tokenizer(batch["prompt"], truncation=True, max_length=MAX_INPUT)
    target_encoding = tokenizer(
        text_target=batch["target"],
        truncation=True,
        max_length=MAX_TARGET,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize both splits in batches, removing every original column from the result.
tokenized_train, tokenized_eval = (
    split.map(tokenize, batched=True, remove_columns=split.column_names)
    for split in (train_ds, eval_ds)
)
# Seq2seq data collator built from the tokenizer; used later by the Trainer.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
print(tokenized_train)


In [None]:
# Modelo base (sem treino)
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained checkpoint, move it to the selected device and switch to eval mode.
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
base_model = base_model.to(device).eval()

@torch.no_grad()
def infer_base(title):
    """Generate product content for ``title`` with the base (not fine-tuned) model.

    Builds the same task prompt used for the training data, runs beam search
    (4 beams, up to 128 new tokens) on ``base_model`` and returns the decoded
    text with special tokens removed and surrounding whitespace stripped.
    """
    prompt_lines = [
        "Task: Generate the product CONTENT from its TITLE.",
        f"TITLE: {title}",
        "CONTENT:",
    ]
    encoded = tokenizer("\n".join(prompt_lines), return_tensors="pt").to(device)
    generated = base_model.generate(**encoded, max_new_tokens=128, num_beams=4)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()

# Quick check of the base model BEFORE training
print(
    "ANTES do treino:",
    infer_base("Apple iPhone 15 Pro Max 256GB, Natural Titanium"),
)


In [None]:
# 4) FAST training with LoRA (works with both newer and older Transformers releases)

import os, gc, inspect, torch
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

# Release objects left over from a previous run of this cell. ``pop`` with a
# default removes a name only when it exists, so no error has to be swallowed
# by a bare ``except``.
for obj in ["trainer", "model"]:
    globals().pop(obj, None)
gc.collect(); torch.cuda.empty_cache()
# NOTE(review): this allocator option is presumably only read when CUDA memory is
# first initialised; an earlier cell already moves a model to the GPU, so setting
# it here may have no effect. Consider moving it to the setup cell. TODO confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Quick sample (raise FAST_TRAIN_SIZE to train on more rows)
FAST_TRAIN_SIZE = 40_000
small_train = tokenized_train.select(range(min(FAST_TRAIN_SIZE, len(tokenized_train))))
print("Train rápido:", len(small_train))

# Model + LoRA
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
model.config.use_cache = False  # turn off the cache flag on the config for training
model = model.to(device)

# Low-rank adapters (rank 4) on the modules named "q" and "v".
lora_cfg = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, r=4, lora_alpha=8, lora_dropout=0.05, target_modules=["q","v"])
model = get_peft_model(model, lora_cfg)

# Training arguments: 2000 optimizer steps, 8 examples per device with 4 accumulation
# steps, cosine schedule with 3% warmup, fp16 only when a GPU is available, and no
# intermediate checkpoints or external reporting.
common_kwargs = dict(
    output_dir=f"{BASE_DIR}/out_fast_lora_en",
    max_steps=2000,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="no",
    report_to="none",
)

# Evaluation stays at the TrainingArguments default ("no"), so neither
# ``eval_strategy`` nor the older ``evaluation_strategy`` needs to be passed and
# no version-specific fallback is required.
args = TrainingArguments(**common_kwargs)

# Newer Transformers releases take the tokenizer as ``processing_class`` and
# deprecate ``tokenizer``; older ones only know ``tokenizer``. Use whichever
# keyword the installed Trainer accepts.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=small_train,
    data_collator=collator,
)
if "processing_class" in inspect.signature(Trainer.__init__).parameters:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer
trainer = Trainer(**trainer_kwargs)

train_result = trainer.train()
print(train_result)

# NOTE(review): the folder name says "small" although BASE_MODEL is
# google/flan-t5-base; the path is kept unchanged so existing artifacts still resolve.
SAVE_DIR = f"{BASE_DIR}/flan_t5_small_lora_fast_en"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("Salvo em:", SAVE_DIR)


In [None]:
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM

# Load a fresh copy of the base checkpoint and attach the adapter saved in SAVE_DIR.
base_ft = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
base_ft = base_ft.to(device)
model_ft = PeftModel.from_pretrained(base_ft, SAVE_DIR)
model_ft = model_ft.to(device).eval()

def make_prompt_en(title, question=None):
    """Build the English generation prompt for a product title.

    Args:
        title: product title inserted after ``TITLE:``.
        question: accepted for call compatibility; it is not used when
            building the prompt.

    Returns:
        The three-line prompt ending in ``CONTENT:``.
    """
    template = "Task: Generate the product CONTENT from its TITLE.\nTITLE: {}\nCONTENT:"
    return template.format(title)

@torch.no_grad()
def infer(title, question="What is the content of this product?"):
    """Generate product content for ``title`` with the LoRA fine-tuned model.

    The prompt comes from ``make_prompt_en`` (``question`` is forwarded to it).
    Decoding is deterministic beam search on ``model_ft``; the result is decoded
    without special tokens and stripped of surrounding whitespace.
    """
    prompt = make_prompt_en(title, question)
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    generation_options = dict(
        max_new_tokens=496,
        min_new_tokens=128,  # guarantees a minimum output length
        do_sample=False,
        num_beams=4,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
    )
    output_ids = model_ft.generate(**encoded, **generation_options)
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return decoded.strip()

# Smoke test: show the first 200 characters generated by the fine-tuned model
print(
    infer("Apple iPhone 15 Pro Max 256GB, Natural Titanium")[:200]
)


In [None]:
# ROUGE-L: install the package if it is missing, then evaluate
import importlib, subprocess, sys

# 1) Make sure the dependency is available
try:
    from rouge_score import rouge_scorer
except ModuleNotFoundError:
    # Pinned to the same version as the install line in the setup cell so a
    # fresh session does not silently pick up a different release.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "rouge-score==0.1.2"])
    # The package appeared after the interpreter started; refresh the import
    # system's finder caches before importing it.
    importlib.invalidate_caches()
    from rouge_score import rouge_scorer

# 2) Imports restantes
from tqdm import tqdm
import numpy as np

# 3) Quick sanity check: 'infer' and 'eval_ds' must already exist from earlier cells
for _required_name, _error_message in (
    ("infer", "Função 'infer' não encontrada. Execute a célula de INFERÊNCIA antes desta."),
    ("eval_ds", "Dataset 'eval_ds' não encontrado. Execute a célula de DADOS antes desta."),
):
    if _required_name not in globals():
        raise RuntimeError(_error_message)
del _required_name, _error_message

# 4) Evaluation: ROUGE-L scorer with stemming enabled
scorer = rouge_scorer.RougeScorer(rouge_types=["rougeL"], use_stemmer=True)

def eval_rouge(sample=10, show=10, infer_fn=None):
    """Score generated content against references with ROUGE-L F1.

    Args:
        sample: maximum number of rows taken from the start of ``eval_ds``.
        show: number of leading examples whose title, prediction and reference
            are printed (truncated) for inspection.
        infer_fn: optional callable ``title -> generated text``. Defaults to the
            notebook's ``infer``; pass another generator (for example the base
            model's) to score it with the same loop.

    Returns:
        ``{"rougeL_f1_mean": float, "n": int}``. The mean is NaN when no rows
        were evaluated.
    """
    generate = infer if infer_fn is None else infer_fn
    subset = eval_ds.select(range(min(sample, len(eval_ds))))
    f1s = []
    for i, ex in enumerate(tqdm(subset)):
        # Treat a missing title like a missing content: as an empty string.
        # Stripping also matches how titles were cleaned for the training prompt.
        title = (ex["title"] or "").strip()
        ref = (ex["content"] or "").strip()
        pred = generate(title)
        f1s.append(scorer.score(ref, pred)["rougeL"].fmeasure)
        if i < show:
            print("\n" + "—"*40)
            print("[TITLE]", title[:160])
            print("[PRED ]", pred[:300])
            print("[GOLD ]", ref[:300])
    # Avoid numpy's empty-mean warning when nothing was evaluated.
    mean_f1 = float(np.mean(f1s)) if f1s else float("nan")
    return {"rougeL_f1_mean": mean_f1, "n": len(subset)}

# Evaluate the first 10 evaluation rows and print every one of them.
eval_rouge(sample=10, show=10)
