In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the input file on Google Drive (valid once Drive is mounted in Colab).
arquivo = "/content/drive/MyDrive/Tech Challenge 3/trn.json"

In [None]:
import pandas as pd
import json

In [None]:
# Load the JSON Lines file record by record into a list.
# Only the first `max_registros` lines of the file are read; the cap counts
# lines read, not successfully parsed records.
registros = []
max_registros = 100000
linhas_invalidas = 0  # malformed lines are skipped, but counted so the loss is visible

with open(arquivo, "r", encoding="utf-8") as f:
  for i, linha in enumerate(f):
    if i >= max_registros:
      break
    if not linha.strip():
      continue  # blank line: nothing to parse
    try:
      registros.append(json.loads(linha))
    except json.JSONDecodeError:
      linhas_invalidas += 1

if linhas_invalidas:
  print(f"Linhas ignoradas por JSON inválido: {linhas_invalidas}")

In [None]:
# Build a DataFrame from the parsed records.
df = pd.DataFrame(registros)

In [None]:
# Keep only the relevant columns.
df = df[["title", "content"]]

In [None]:
# Drop rows whose title or content is null or blank, then keep only the
# first occurrence of each title.
df = df.dropna(subset=["title", "content"])
titulo_preenchido = df["title"].str.strip().ne("")
conteudo_preenchido = df["content"].str.strip().ne("")
df = df.loc[titulo_preenchido & conteudo_preenchido]
df = df.drop_duplicates(subset="title", keep="first")

print("Registros válidos:", len(df))
print(df.head())

Registros válidos: 7498
                          title  \
0   Girls Ballet Tutu Neon Pink   
3                 Mog's Kittens   
7   Girls Ballet Tutu Neon Blue   
12                  The Prophet   
13    Rightly Dividing the Word   

                                              content  
0   High quality 3 layer ballet tutu. 12 inches in...  
3   Judith Kerr&#8217;s best&#8211;selling adventu...  
7   Dance tutu for girls ages 2-8 years. Perfect f...  
12  In a distant, timeless place, a mysterious pro...  
13         --This text refers to thePaperbackedition.  


In [None]:
# Output path for the cleaned dataset (JSON Lines: one record per line).
arquivo_jsonl = "/content/drive/MyDrive/Tech Challenge 3/tst2.json"

# Serialise each row as a {"title", "content"} object on its own line.
with open(arquivo_jsonl, "w", encoding="utf-8") as f:
    for titulo, conteudo in zip(df["title"], df["content"]):
        linha_json = json.dumps({"title": titulo, "content": conteudo}, ensure_ascii=False)
        f.write(linha_json + "\n")

print("Dataset exportado para JSONL com sucesso!")
print("Caminho:", arquivo_jsonl)


Dataset exportado para JSONL com sucesso!
Caminho: /content/drive/MyDrive/Tech Challenge 3/tst2.json


In [None]:
# !pip install ollama
# !pip install -q transformers torch

Collecting ollama
  Downloading ollama-0.6.0-py3-none-any.whl.metadata (4.3 kB)
Downloading ollama-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: ollama
Successfully installed ollama-0.6.0


In [None]:
# ollama run llama2
# !ollama serve &

time=2025-10-06T13:57:44.619Z level=INFO source=routes.go:1475 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:INFO OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_REMOTES:[ollama.com] OLLAMA_SCHED_SPRE

In [None]:
# !ollama serve
# !ollama start
# !ollama --version

time=2025-10-06T14:16:25.367Z level=INFO source=routes.go:1475 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:INFO OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_REMOTES:[ollama.com] OLLAMA_SCHED_SPRE

In [None]:
# !curl -fsSL https://ollama.com/install.sh | sh
# !ollama serve &
# !ollama pull llama3

>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
# !which ollama
# !ollama --help

/bin/bash: line 1: wich: command not found


In [None]:
# import json
# import ollama

# def summarize_amazon(jsonl_file, output_file="amazon_summaries.json"):
#     summaries = []

#     with open(jsonl_file, 'r') as f:
#         for line in f:
#             item = json.loads(line)
#             title = item["title"]
#             content = item["content"]

#             prompt = f"Summarize the following Amazon product in one or two sentences:\n\nTitle: {title}\nDescription: {content}"

#             response = ollama.chat(
#                 model="llama2",  # ou 'llama3', 'mistral', dependendo do que você tiver baixado
#                 messages=[
#                     {"role": "system", "content": "You are a helpful assistant that summarizes products."},
#                     {"role": "user", "content": prompt}
#                 ]
#             )

#             summary_text = response["message"]["content"].strip()

#             summaries.append({
#                 "title": title,
#                 "content": content,
#                 "summary": summary_text
#             })

#     with open(output_file, 'w') as out:
#         json.dump({"amazon_summaries": summaries}, out, indent=2)

# # exemplo de chamada
# summarize_amazon("/content/drive/MyDrive/Tech Challenge 3/trn2.json")


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Tech Challenge 3/trn2.json'

In [None]:
# !pip install huggingface_hub -q

In [None]:
#!pip install transformers accelerate sentencepiece -q

# from transformers import pipeline
# import json

# # Carrega um modelo pequeno estilo LLaMA (pode trocar por outro mais forte, ex: meta-llama/Llama-2-7b-hf, mas vai pesar no Colab!)
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# def summarize_amazon(jsonl_file, output_file="amazon_summaries.json"):
#     summaries = []

#     with open(jsonl_file, 'r') as f:
#         for line in f:
#             item = json.loads(line)
#             title = item["title"]
#             content = item["content"]

#             text = f"Title: {title}\nDescription: {content}"
#             summary = summarizer(text, max_length=50, min_length=15, do_sample=False)[0]["summary_text"]

#             summaries.append({
#                 "title": title,
#                 "content": content,
#                 "summary": summary
#             })

#     with open(output_file, 'w') as out:
#         json.dump({"amazon_summaries": summaries}, out, indent=2)

# # exemplo de chamada
# summarize_amazon("/content/drive/MyDrive/Tech Challenge 3/trn2.json")


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Tech Challenge 3/trn2.json'

In [None]:
!pip install transformers accelerate sentencepiece -q

In [None]:
from transformers import pipeline
import json

**Carregar modelo de sumarização (BART)**

In [None]:
# Load the summarization model (BART).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_amazon(jsonl_file, output_file="amazon_summaries.json"):
    """Summarize every product in a JSON Lines file and save the results as JSON.

    Each non-blank input line must be a JSON object with "title" and
    "content" keys. All results are kept in memory and written once at the
    end as {"amazon_summaries": [{"title", "content", "summary"}, ...]}.

    Parameters:
      - jsonl_file: path of the input file (JSON Lines, read as UTF-8)
      - output_file: path of the JSON file to write
    """
    summaries = []

    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(jsonl_file, 'r', encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            item = json.loads(line)
            title = item["title"]
            content = item["content"]

            text = f"Title: {title}\nDescription: {content}"
            summary = summarizer(
                text,
                max_length=50,   # upper bound on tokens in the summary
                min_length=15,   # lower bound on tokens in the summary
                do_sample=False,
                truncation=True  # clip inputs longer than the model's maximum input length
            )[0]["summary_text"]

            summaries.append({
                "title": title,
                "content": content,
                "summary": summary
            })

    with open(output_file, 'w', encoding="utf-8") as out:
        json.dump({"amazon_summaries": summaries}, out, indent=2)

# Example call
summarize_amazon("/content/drive/MyDrive/Tech Challenge 3/tst2.json")


Device set to use cpu
Your max_length is set to 50, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 50, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 50, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_length is set to 50, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max

KeyboardInterrupt: 

NOVO TESTE

In [None]:
!pip install transformers accelerate sentencepiece tqdm -q
import json
from tqdm import tqdm
from transformers import pipeline

# Initialise the model (you can switch to "t5-base" if you prefer).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_amazon(jsonl_file, output_file="amazon_summaries_stream.json", flush_every=10):
    """
    Generate summaries from a JSON Lines file, writing them to disk incrementally.

    The output is one JSON document, {"amazon_summaries": [...]}, whose
    entries hold "title", "content" and "summary". Items are written as they
    are produced and the closing brackets are written even when the run is
    interrupted, so a partial file is still valid JSON.

    A line that cannot be parsed or summarized is still recorded, with the
    summary replaced by an "[ERROR: ...]" marker. Blank lines are skipped.

    Parameters:
      - jsonl_file: path of the input file (JSON Lines format)
      - output_file: path of the output file (JSON)
      - flush_every: flush the output to disk every this many items
        (0 or None disables the periodic flush)
    """
    count = 0

    # Count the lines up front so the progress bar has a total.
    with open(jsonl_file, 'r', encoding="utf-8") as f:
        total_lines = sum(1 for _ in f)

    with open(jsonl_file, 'r', encoding="utf-8") as f_in, open(output_file, 'w', encoding="utf-8") as f_out:
        f_out.write('{"amazon_summaries": [\n')
        first_item = True

        try:
            # Iterating the file handle keeps a single line in memory at a time.
            for line in tqdm(f_in, total=total_lines, desc="Gerando resumos"):
                if not line.strip():
                    continue

                # Reset per item so a failure never reuses the previous item's
                # title/content, nor hits an unbound name on the first line.
                title, content = "", ""
                try:
                    item = json.loads(line)
                    title = item.get("title", "")
                    content = item.get("content", "")

                    text = f"Title: {title}\nDescription: {content}"

                    # Generate the summary; over-long inputs are truncated
                    # instead of failing inside the model.
                    summary = summarizer(
                        text,
                        max_length=20,
                        min_length=15,
                        do_sample=False,
                        truncation=True
                    )[0]["summary_text"]

                except Exception as e:
                    summary = f"[ERROR: {e}]"

                result = {
                    "title": title,
                    "content": content,
                    "summary": summary
                }

                # Serialise first, then write separator and item in one call,
                # so an interruption cannot leave a dangling separator.
                serialized = json.dumps(result, ensure_ascii=False)
                f_out.write(serialized if first_item else ",\n" + serialized)
                first_item = False

                count += 1
                if flush_every and count % flush_every == 0:
                    f_out.flush()  # push buffered items to disk
        finally:
            # Always close the JSON structure, including on KeyboardInterrupt.
            f_out.write("\n]}\n")

    print(f"\n✅ Concluído! {count} produtos processados e salvos em {output_file}")

Device set to use cpu


In [None]:
# Summarize the exported JSONL file.
summarize_amazon("/content/drive/MyDrive/Tech Challenge 3/tst2.json")

Gerando resumos:   0%|          | 1/100000 [00:23<645:00:38, 23.22s/it]Your max_length is set to 20, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Gerando resumos:   0%|          | 2/100000 [00:40<540:37:08, 19.46s/it]Your max_length is set to 20, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Gerando resumos:   0%|          | 4/100000 [01:11<477:01:44, 17.17s/it]Your max_length is set to 20, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Gerando resumos:   0%|          | 10/100000 [0

In [None]:
# #!pip install transformers accelerate sentencepiece -q

# from transformers import pipeline
# import json

# # Carregar modelo de sumarização (T5-base)
# summarizer = pipeline("summarization", model="t5-base")

# def summarize_amazon(jsonl_file, output_file="/content/drive/MyDrive/Tech Challenge 3/amazon_summaries_test.json", limit=5):
#     summaries = []

#     with open(jsonl_file, 'r') as f:
#         for i, line in enumerate(f):
#             if i >= limit:  # só processa os primeiros N itens
#                 break
#             item = json.loads(line)
#             title = item["title"]
#             content = item["content"]

#             text = f"summarize: Title: {title}\nDescription: {content}"
#             summary = summarizer(
#                 text,
#                 max_length=20,   # limite máximo de tokens no resumo
#                 min_length=10,   # mínimo de tokens no resumo
#                 do_sample=False
#             )[0]["summary_text"]

#             summaries.append({
#                 "title": title,
#                 "content": content,
#                 "summary": summary
#             })

#     with open(output_file, 'w') as out:
#         json.dump({"/content/drive/MyDrive/Tech Challenge 3/amazon_summaries": summaries}, out, indent=2)

#     print(f"Arquivo salvo em {output_file}")

# # exemplo de chamada (teste com 5 produtos)
# summarize_amazon("/content/drive/MyDrive/Tech Challenge 3/tst2.json", limit=5)


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 20, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/t

KeyboardInterrupt: 

***Executando Este***

In [None]:
# #!pip install transformers accelerate sentencepiece -q

# from transformers import pipeline
# import json

# # Carregar modelo de sumarização (T5-base)
# summarizer = pipeline("summarization", model="t5-base")

# def summarize_amazon(jsonl_file, output_file="/content/drive/MyDrive/Tech Challenge 3/amazon_summaries.json"):
#     summaries = []

#     with open(jsonl_file, 'r') as f:
#         for line in f:
#             item = json.loads(line)
#             title = item["title"]
#             content = item["content"]

#             text = f"summarize: Title: {title}\nDescription: {content}"
#             summary = summarizer(
#                 text,
#                 max_length=20,   # limite máximo de tokens no resumo
#                 min_length=10,   # mínimo de tokens no resumo
#                 do_sample=False
#             )[0]["summary_text"]

#             summaries.append({
#                 "title": title,
#                 "content": content,
#                 "summary": summary
#             })

#     with open(output_file, 'w') as out:
#         json.dump({"/content/drive/MyDrive/Tech Challenge 3/amazon_summaries": summaries}, out, indent=2)

#     print(f"✅ Resumos gerados e salvos em {output_file}")

# # Executar para todos os itens
# summarize_amazon("/content/drive/MyDrive/Tech Challenge 3/tst2.json")


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 20, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/t

In [None]:
# !pip install huggingface_hub -q

In [None]:
# import json
# from huggingface_hub import InferenceClient
# from google.colab import userdata

# # Criar cliente da API do HuggingFace
# # Get the API key from Colab secrets
# hf_api_key = userdata.get('HF_TOKEN')  # token redacted: read it from Colab secrets, never hardcode it (revoke the token that was previously written here)

# if not hf_api_key:
#     print("Error: Hugging Face API key not found in Colab secrets. Please add it as 'HF_TOKEN'.")
# else:
#     client = InferenceClient(api_key=hf_api_key)

#     def summarize_news(news_file, output_file="news_summaries.json", model="meta-llama/Llama-2-7b-chat-hf"):
#         # Carrega o conteúdo das notícias de um arquivo JSON
#         try:
#             with open(news_file, 'r') as file:
#                 news_data = json.load(file)
#                 # Check if 'news_content' key exists and is a list
#                 if 'news_content' in news_data and isinstance(news_data['news_content'], list):
#                     news_contents = news_data['news_content']
#                 else:
#                     print(f"Error: 'news_content' key not found or is not a list in {news_file}")
#                     return
#         except FileNotFoundError:
#             print(f"Error: File not found at {news_file}")
#             return
#         except json.JSONDecodeError:
#             print(f"Error: Could not decode JSON from {news_file}")
#             return


#         summaries = []

#         for content in news_contents:
#             prompt = f"Summarize the following news article in 2-3 sentences:\n\n{content}"

#             try:
#                 # Chama a Inference API do HuggingFace
#                 response = client.text_generation(
#                     model=model,
#                     prompt=prompt,
#                     max_new_tokens=120,
#                     temperature=0.7
#                 )

#                 summary_text = response.strip()
#                 print(summary_text)

#                 summaries.append({
#                     "story": content,
#                     "summary": summary_text
#                 })
#             except Exception as e:
#                 print(f"Error summarizing content: {e}")
#                 summaries.append({
#                     "story": content,
#                     "summary": f"Error summarizing: {e}"
#                 })


#         # Salva os resultados em um arquivo JSON
#         try:
#             with open(output_file, 'w') as json_file:
#                 json.dump({"news_summaries": summaries}, json_file, indent=2)
#             print(f"✅ Resumos salvos em {output_file}")
#         except IOError as e:
#             print(f"Error saving summaries to {output_file}: {e}")


#     # Exemplo de chamada
#     summarize_news('/content/drive/MyDrive/news_contents.json')

Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing content: 
Error summarizing co

In [None]:
!pip install transformers datasets evaluate rouge_score sentencepiece -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


**Carregar o Dataset**

In [None]:
import json
from datasets import Dataset

def load_amazon_summaries(json_file):
    """Load generated summaries from a JSON file into a `Dataset` of input/output pairs.

    The file must hold a JSON object whose list of items sits under the
    "amazon_summaries" key; the key that embeds the full Drive path is also
    accepted. Every item needs "title", "content" and "summary".

    Items whose summary is an "[ERROR: ...]" marker are skipped so that
    failed generations are not used as training targets.

    Returns a `Dataset` with columns "input" (T5-style prompt) and "output"
    (the reference summary).

    Raises KeyError when neither key is present in the file.
    """
    accepted_keys = (
        "amazon_summaries",
        "/content/drive/MyDrive/Tech Challenge 3/amazon_summaries",
    )

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for key in accepted_keys:
        if key in data:
            items = data[key]
            break
    else:
        raise KeyError(f"none of the expected keys {accepted_keys} found in {json_file}")

    inputs, outputs = [], []
    for item in items:
        title = item["title"]
        content = item["content"]
        summary = item["summary"]

        if isinstance(summary, str) and summary.startswith("[ERROR:"):
            continue  # placeholder recorded for a failed generation, not a real summary

        # Build the T5-style input prompt.
        text = f"summarize: Title: {title}. Description: {content}"
        inputs.append(text)
        outputs.append(summary)

    return Dataset.from_dict({"input": inputs, "output": outputs})

dataset = load_amazon_summaries("/content/drive/MyDrive/Tech Challenge 3/amazon_summaries_stream.json")
# Fixed seed so the train/test split is identical on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 4
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 1
    })
})

**Tokenizar**

In [None]:
from transformers import AutoTokenizer

model_name = "t5-base"  # or "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Expects a batch (lists of strings), as passed by `map(batched=True)`.
    `examples["input"]` is tokenized to at most 256 tokens and
    `examples["output"]` to at most 64, both padded to their maximum length.
    Padding positions in the labels are replaced with -100 so the loss
    skips them instead of training the model to emit pad tokens.

    Returns the tokenized inputs with an added "labels" entry.
    """
    model_inputs = tokenizer(examples["input"], max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(examples["output"], max_length=64, truncation=True, padding="max_length")
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split of the dataset in batches.
tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

**Configurar modelo + treino**

In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer, GenerationConfig, DataCollatorForSeq2Seq
import evaluate

# Metric object used by compute_metrics below.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores for an evaluation pass.

    `eval_pred` unpacks into (predictions, labels). Predictions may arrive as
    token ids, as raw logits, or as a tuple whose first element is the
    logits; labels may contain -100 at ignored positions. Both are
    normalised to valid token ids before decoding.

    Returns a dict mapping each ROUGE key to a float score.
    """
    import numpy as np  # kept local so the block does not rely on a file-level numpy import

    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        # (batch, sequence, vocabulary) logits: pick the most likely token per position.
        predictions = predictions.argmax(axis=-1)

    # -100 marks padded/ignored positions and is not a decodable token id.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Accept both result shapes: aggregate objects exposing `.mid.fmeasure`
    # and plain numeric scores.
    return {
        k: (v.mid.fmeasure if hasattr(v, "mid") else float(v))
        for k, v in result.items()
    }

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define generation configuration and pass it to the model
generation_config = GenerationConfig(max_new_tokens=64)
model.generation_config = generation_config

# NOTE(review): no evaluation strategy is set here, so evaluation may never
# run during training even though do_eval=True — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./t5-finetuned-amazon",
    do_eval=True,                  # enable evaluation
    # evaluate_during_training=True, # legacy option from older versions
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=20,
)

# Pad labels with -100 (not the pad token id) so padded label positions are
# ignored by the loss instead of being learned as targets.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


**Treinar**

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'pad_token_id': 0}.


Step,Training Loss


TrainOutput(global_step=3, training_loss=8.9047482808431, metrics={'train_runtime': 166.4834, 'train_samples_per_second': 0.072, 'train_steps_per_second': 0.018, 'total_flos': 1855418572800.0, 'train_loss': 8.9047482808431, 'epoch': 3.0})

**Testar modelo fine-tunado**

In [None]:
text = "summarize: Title: Girls Ballet Tutu Neon Pink. Description: High quality 3 layer ballet tutu. 12 inches in length"
# Move the encoded inputs to the model's device so generation also works
# when the model was trained on a GPU.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# The model's generation config already sets max_new_tokens, which takes
# precedence over max_length, so the limit is given as max_new_tokens.
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Girls Ballet Tutu Neon Pink. Description: High quality 3 layer ballet tutu. 12 inches in length.


**Salvar modelo**

In [None]:
# Save the model and its tokenizer into the same directory.
model.save_pretrained("/content/my-t5-amazon")
tokenizer.save_pretrained("/content/my-t5-amazon")


('/content/my-t5-amazon/tokenizer_config.json',
 '/content/my-t5-amazon/special_tokens_map.json',
 '/content/my-t5-amazon/spiece.model',
 '/content/my-t5-amazon/added_tokens.json',
 '/content/my-t5-amazon/tokenizer.json')

**Validação**

In [None]:
from transformers import pipeline

# Load the fine-tuned model
summarizer = pipeline("summarization", model="/content/my-t5-amazon", tokenizer="/content/my-t5-amazon")

# Helper to validate a single item
def validate_summary(title, description, expected_summary=None):
    """Summarize one product and print the result next to the optional reference.

    The prompt is built from `title` and `description`. `expected_summary`,
    when truthy, is printed as well for manual comparison.
    """
    prompt = f"summarize: Title: {title}. Description: {description}"
    generated = summarizer(prompt, max_length=50, min_length=10, do_sample=False)
    summary_text = generated[0]["summary_text"]

    print("🔹 Produto:", title)
    print("📝 Resumo gerado:", summary_text)
    if expected_summary:
        print("✅ Resumo esperado:", expected_summary)
    print("-" * 80)

# Test with items from the dataset
validate_summary(
    "Girls Ballet Tutu Neon Pink",
    "High quality 3 layer ballet tutu. 12 inches in length",
    "Girls Ballet Tutu Neon Pink Description: High quality 3 layer ballet tutu."
)

validate_summary(
    "Mog's Kittens",
    "Judith Kerr’s best–selling adventures of that cat Mog have entertained children for more than 30 years...",
    "Judith Kerr's best-selling books have entertained children for more than 30 years . these sturdy little board books are just the thing to delight the very young ."
)

# Test with a new product (not in the dataset)
validate_summary(
    "Kids Soccer Ball",
    "Durable and lightweight soccer ball designed for beginners and young players.",
    None
)


Device set to use cpu
Your max_length is set to 50, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 50, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)


🔹 Produto: Girls Ballet Tutu Neon Pink
📝 Resumo gerado: Girls Ballet Tutu Neon Pink. High quality 3 layer ballet tutu. 12 inches in length.
✅ Resumo esperado: Girls Ballet Tutu Neon Pink Description: High quality 3 layer ballet tutu.
--------------------------------------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 50, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


🔹 Produto: Mog's Kittens
📝 Resumo gerado: Judith Kerr's best–selling adventures of that cat Mog have entertained children for more than 30 years .
✅ Resumo esperado: Judith Kerr's best-selling books have entertained children for more than 30 years . these sturdy little board books are just the thing to delight the very young .
--------------------------------------------------------------------------------
🔹 Produto: Kids Soccer Ball
📝 Resumo gerado: the kids soccer ball is designed for beginners and young players . it is a lightweight and durable ball designed for players of all ages .
--------------------------------------------------------------------------------


In [None]:
import evaluate
from transformers import pipeline

# Load the fine-tuned model
summarizer = pipeline("summarization", model="/content/my-t5-amazon", tokenizer="/content/my-t5-amazon")

# --------------------------
# 🔸 1. Validação manual
# --------------------------
def validate_summary(title, description, expected_summary=None, max_new_tokens=50, min_length=10):
    """Summarize one product and print the result next to an optional reference.

    Args:
        title: Product title inserted into the prompt.
        description: Product description inserted into the prompt.
        expected_summary: Optional reference summary printed for visual
            comparison; nothing is printed for it when it is falsy.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_length: Minimum generation length forwarded to the pipeline.

    Returns:
        None. The generated summary is only printed.
    """
    text = f"summarize: Title: {title}. Description: {description}"
    # Use max_new_tokens, not max_length: the runtime warning reports that
    # max_new_tokens (=256) is already set and takes precedence over
    # max_length, so the previous max_length=50 cap was never applied.
    result = summarizer(
        text,
        max_new_tokens=max_new_tokens,
        min_length=min_length,
        do_sample=False,  # deterministic decoding (no sampling)
    )[0]["summary_text"]

    print("🔹 Produto:", title)
    print("📝 Resumo gerado:", result)
    if expected_summary:
        print("✅ Resumo esperado:", expected_summary)
    print("-" * 80)

# Examples: (title, description, expected summary or None) triples, run in order.
example_products = [
    (
        "Girls Ballet Tutu Neon Pink",
        "High quality 3 layer ballet tutu. 12 inches in length",
        "Girls Ballet Tutu Neon Pink Description: High quality 3 layer ballet tutu.",
    ),
    (
        "Mog's Kittens",
        "Judith Kerr’s best–selling adventures of that cat Mog have entertained children for more than 30 years...",
        "Judith Kerr's best-selling books have entertained children for more than 30 years . these sturdy little board books are just the thing to delight the very young .",
    ),
    # New product (not seen during training), so there is no reference summary.
    (
        "Kids Soccer Ball",
        "Durable and lightweight soccer ball designed for beginners and young players.",
        None,
    ),
]

for example_title, example_description, example_reference in example_products:
    validate_summary(example_title, example_description, example_reference)

# --------------------------
# 🔸 2. Validação automática com ROUGE
# --------------------------
rouge = evaluate.load("rouge")

predictions = []
references = []

# `dataset` is not created in this cell: it must be the object loaded before training.
for item in dataset["test"]:
    # NOTE(review): validate_summary builds "Title: ... Description: ..." prompts, while
    # this loop uses item['input'] as-is; presumably the field already has the
    # format used during training. TODO confirm.
    text = f"summarize: {item['input']}"
    # Use max_new_tokens, not max_length: the runtime warning reports that
    # max_new_tokens (=256) is already set and takes precedence over
    # max_length, so the previous max_length=50 cap was never applied.
    result = summarizer(
        text,
        max_new_tokens=50,
        min_length=10,
        do_sample=False,  # deterministic decoding (no sampling)
    )[0]["summary_text"]

    predictions.append(result)
    references.append(item["output"])

# Compute the ROUGE metrics over all collected prediction/reference pairs.
results = rouge.compute(predictions=predictions, references=references)

# NOTE(review): the recorded run printed 1.0000 for every ROUGE variant. A perfect
# score is unusual for generated summaries; check the size of dataset["test"]
# and whether it overlaps with the training examples before trusting it.
print("\n📊 Resultados ROUGE:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")


Device set to use cpu
Your max_length is set to 50, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 50, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/t

🔹 Produto: Girls Ballet Tutu Neon Pink
📝 Resumo gerado: Girls Ballet Tutu Neon Pink. High quality 3 layer ballet tutu. 12 inches in length.
✅ Resumo esperado: Girls Ballet Tutu Neon Pink Description: High quality 3 layer ballet tutu.
--------------------------------------------------------------------------------


Your max_length is set to 50, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)


🔹 Produto: Mog's Kittens
📝 Resumo gerado: Judith Kerr's best–selling adventures of that cat Mog have entertained children for more than 30 years .
✅ Resumo esperado: Judith Kerr's best-selling books have entertained children for more than 30 years . these sturdy little board books are just the thing to delight the very young .
--------------------------------------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


🔹 Produto: Kids Soccer Ball
📝 Resumo gerado: the kids soccer ball is designed for beginners and young players . it is a lightweight and durable ball designed for players of all ages .
--------------------------------------------------------------------------------


Your max_length is set to 50, but your input_length is only 22. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



📊 Resultados ROUGE:
rouge1: 1.0000
rouge2: 1.0000
rougeL: 1.0000
rougeLsum: 1.0000
