#### Instalação de Dependências

In [None]:
!pip show protobuf

Name: protobuf
Version: 3.20.3
Summary: Protocol Buffers
Home-page: https://developers.google.com/protocol-buffers/
Author: 
Author-email: 
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: google-api-core, google-cloud-bigquery, google-cloud-bigquery-connection, google-cloud-bigquery-storage, google-cloud-datastore, google-cloud-firestore, google-cloud-functions, google-cloud-language, google-cloud-translate, googleapis-common-protos, grpc-google-iam-v1, grpcio-status, orbax-checkpoint, proto-plus, tensorboard, tensorflow, tensorflow-datasets, tensorflow-hub, tensorflow-metadata


In [None]:
!pip install datasets==2.11.0 -q
!pip install evaluate==0.4.0 -q
!pip install rouge_score==0.1.2 -q
!pip install loralib==0.1.1 -q
!pip install peft==0.3.0 -q
!pip install sentencepiece==0.1.99 -q
!pip install pandas==2.0.3 -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/468.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/468.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

#### Configurando Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime. MOUNT_PATH is reused further
# down to build the project and data paths.
from google.colab import drive
MOUNT_PATH = '/content/drive'
drive.mount(MOUNT_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importações

In [None]:
import os
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict

## Parâmetros

In [None]:
# Token budget for the tokenized prompt; the generation cells also reuse it
# as `max_new_tokens`.
MAX_LENGTH = 200

# Project locations on the mounted Drive.
PROJECT_PATH = os.path.join(
    MOUNT_PATH,
    "MyDrive",
    "Projetos",
    "pocos-de-caldas-news",
)
RAW_DATA_PATH = os.path.join(
    PROJECT_PATH,
    "data",
    "processed",
    "processed-pocos-news.csv",
)

# Prompt wrapped around every news article; `{content}` is filled with the
# article text.
PROMPT_TEMPLATE = """
Categorize a seguinte notícia:

{content}

Categoria:
"""

# Hub identifier of the checkpoint that gets fine-tuned.
BASE_MODEL = 'unicamp-dl/ptt5-large-portuguese-vocab'

# The fine-tuned weights are stored under the project folder, named after the
# last path component of the base model identifier.
_base_model_short_name = BASE_MODEL.split("/")[-1]
FINE_TUNED_MODEL_PATH = os.path.join(
    PROJECT_PATH,
    "models",
    "finetune",
    f"finetuned-{_base_model_short_name}",
)

## Funções

In [None]:
def ttv_split(ds, train_ratio, test_ratio, seed=None):
    """Split a dataset into train / test / validation subsets.

    The dataset is first split into a train part (`train_ratio` of the rows)
    and a remainder; the remainder is then split again so that the test part
    is `test_ratio` of the original dataset and the validation part receives
    what is left (`1 - train_ratio - test_ratio`).

    Args:
        ds: Object exposing `train_test_split(train_size=..., seed=...)`
            (a Hugging Face `Dataset` in this notebook).
        train_ratio: Fraction of rows assigned to the train split.
        test_ratio: Fraction of rows assigned to the test split.
        seed: Optional seed forwarded to both shuffled splits so the
            partition can be reproduced. `None` keeps the previous
            (unseeded) behaviour.

    Returns:
        A `DatasetDict` with the keys 'train', 'test' and 'validation'.

    Raises:
        ValueError: If the ratios do not leave a strictly positive share for
            each of the three splits.
    """
    valid_ratio = 1 - train_ratio - test_ratio

    # Fail early with a clear message instead of letting the second split
    # receive a train_size outside (0, 1).
    if not (0 < train_ratio < 1 and 0 < test_ratio < 1 and valid_ratio > 0):
        raise ValueError(
            "train_ratio and test_ratio must each be in (0, 1) and sum to less "
            f"than 1; got train_ratio={train_ratio}, test_ratio={test_ratio}"
        )

    intermediate = ds.train_test_split(train_size=train_ratio, seed=seed)
    test_and_valid = intermediate['test']

    # Within the remainder, the test split takes its share relative to
    # test + validation.
    final = test_and_valid.train_test_split(
        train_size=test_ratio / (test_ratio + valid_ratio),
        seed=seed,
    )

    return DatasetDict({
        'train': intermediate['train'],
        'test': final['train'],
        'validation': final['test']})

def convert_to_huggingdface_dataset(df):
    """Build the train/test/validation/predict splits from a DataFrame.

    Rows whose 'predict' column is false are split 85% / 10% / 5% into
    train / test / validation via `ttv_split`; rows whose 'predict' column is
    true are attached unchanged under the 'predict' key.

    Args:
        df: DataFrame with a 'predict' column usable as a boolean mask.

    Returns:
        The `DatasetDict` produced by `ttv_split` with an extra 'predict'
        entry.
    """
    is_predict_row = df['predict']

    splits = ttv_split(
        Dataset.from_pandas(df[~is_predict_row]),
        train_ratio=0.85,
        test_ratio=0.10,
    )
    splits['predict'] = Dataset.from_pandas(df[is_predict_row])
    return splits


def tokenize_function(example: dict) -> dict:
    """Tokenize a batch of articles and category labels for seq2seq training.

    Meant to be used with `Dataset.map(..., batched=True)`: `example["content"]`
    is iterated as a list of article texts and `example["categories"]` is
    passed to the tokenizer as the batch of target texts. Relies on the
    module-level `tokenizer`, `PROMPT_TEMPLATE` and `MAX_LENGTH`.

    NOTE(review): this notebook defines `tokenize_function` a second time in
    the "Base Model" section (label max_length 10 instead of 30); that later
    definition is the one in effect when the datasets are mapped.

    Args:
        example: Batch dict with the keys "content" and "categories".

    Returns:
        The same dict with `input_ids`, `attention_mask` and `labels` added.
    """
    prompt = [PROMPT_TEMPLATE.format(content=content) for content in example["content"]]

    model_inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    example['input_ids'] = model_inputs["input_ids"]
    # Keep the mask so the encoder does not attend to the padding added above.
    example['attention_mask'] = model_inputs["attention_mask"]

    label_ids = tokenizer(
        example["categories"],
        padding="max_length",
        truncation=True,
        max_length=30,
    )["input_ids"]

    # Padding positions in the labels are set to -100, the index the loss
    # ignores, so the model is not trained to predict pad tokens.
    pad_token_id = tokenizer.pad_token_id
    example['labels'] = [
        [-100 if token_id == pad_token_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]

    return example

### Dataset

In [None]:
# Read the processed news CSV, discarding the index-like columns when present.
df = pd.read_csv(RAW_DATA_PATH).drop(columns=["Unnamed: 0", "id"], errors="ignore")

hf_ds = convert_to_huggingdface_dataset(df)

# Keep the rows flagged for prediction in their own variable and remove them
# from the DatasetDict, leaving only train / test / validation in `hf_ds`.
hf_ds_predict = hf_ds.pop('predict')

hf_ds

DatasetDict({
    train: Dataset({
        features: ['content', 'categories', 'predict', '__index_level_0__'],
        num_rows: 7975
    })
    test: Dataset({
        features: ['content', 'categories', 'predict', '__index_level_0__'],
        num_rows: 938
    })
    validation: Dataset({
        features: ['content', 'categories', 'predict', '__index_level_0__'],
        num_rows: 470
    })
})

In [None]:
# No longer needed by this cell; left in place in case other cells rely on it.
from functools import reduce

# Count how often each category occurs. A row may carry several
# comma-separated categories, so each one is counted separately. Missing
# values become the string "nan" through astype("str") and are counted too.
# explode() replaces the previous reduce-based list concatenation, which
# copied the accumulated list on every step.
ranking = (
    df['categories']
    .astype("str")
    .str.split(",")
    .explode(ignore_index=True)
    .rename(None)  # drop the column name so the result is labelled as before
    .value_counts()
)
ranking

Geral             4119
Policial          3326
nan               2419
Economia           620
Esporte            429
Política           325
Cultura            293
Educação           229
cidade             172
Entretenimento     171
Name: count, dtype: int64

## Base Model

In [None]:
# NOTE(review): this import sits outside the notebook's main import cell, and
# the "Predictions" section later rebinds `tokenizer` using AutoTokenizer
# instead of T5Tokenizer.
from transformers import T5Tokenizer

model_name = BASE_MODEL

# Load the base checkpoint with bfloat16 weights.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Module-level tokenizer; tokenize_function and the generation cells read it
# as a global.
tokenizer = T5Tokenizer.from_pretrained(model_name)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [None]:

def tokenize_function(example: dict) -> dict:
    """Tokenize a batch of articles and category labels for seq2seq training.

    Meant to be used with `Dataset.map(..., batched=True)`: `example["content"]`
    is iterated as a list of article texts and `example["categories"]` is
    passed to the tokenizer as the batch of target texts. Relies on the
    module-level `tokenizer`, `PROMPT_TEMPLATE` and `MAX_LENGTH`.

    Args:
        example: Batch dict with the keys "content" and "categories".

    Returns:
        The same dict with `input_ids`, `attention_mask` and `labels` added.
        Prompts are padded/truncated to MAX_LENGTH tokens and labels to 10.
    """
    prompt = [PROMPT_TEMPLATE.format(content=content) for content in example["content"]]

    model_inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    example['input_ids'] = model_inputs["input_ids"]
    # Keep the mask so the encoder does not attend to the padding added above.
    example['attention_mask'] = model_inputs["attention_mask"]

    label_ids = tokenizer(
        example["categories"],
        padding="max_length",
        truncation=True,
        max_length=10,
    )["input_ids"]

    # Padding positions in the labels are set to -100, the index the loss
    # ignores, so the model is not trained to predict pad tokens.
    pad_token_id = tokenizer.pad_token_id
    example['labels'] = [
        [-100 if token_id == pad_token_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]

    return example

# Apply the tokenization to every split, one batch at a time.
tokenized_datasets = hf_ds.map(tokenize_function, batched=True)
# Disabled: dropping the raw text columns after tokenization.
# tokenized_datasets = tokenized_datasets.remove_columns(['content', 'categories'])

# Preview of the tokenized train split.
# NOTE(review): this converts the entire split to a DataFrame just for display.
tokenized_datasets['train'].to_pandas()

Map:   0%|          | 0/7975 [00:00<?, ? examples/s]

Map:   0%|          | 0/938 [00:00<?, ? examples/s]

Map:   0%|          | 0/470 [00:00<?, ? examples/s]

Unnamed: 0,content,categories,predict,__index_level_0__,input_ids,labels
0,A consulta a valores esquecidos no sistema fin...,Geral,False,10943,"[15966, 5040, 7982, 7, 273, 6742, 46, 25, 6796...","[2250, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
1,"Um homem foi atropelado na tarde deste sábado,...",Geral,False,9168,"[15966, 5040, 7982, 7, 273, 6742, 46, 292, 878...","[2250, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Quatro pessoas ficaram feridas após sofrerem u...,Geral,False,7635,"[15966, 5040, 7982, 7, 273, 6742, 46, 6531, 24...","[2250, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,O Ministério da Saúde informou nesta quinta-fe...,Geral,False,4604,"[15966, 5040, 7982, 7, 273, 6742, 46, 28, 1730...","[2250, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"O Rock In Rio anunciou nesta quinta-feira,16, ...",Entretenimento,False,6307,"[15966, 5040, 7982, 7, 273, 6742, 46, 28, 1894...","[638, 1115, 5911, 1, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...
7970,Um levantamento feito pela Confederação Nacion...,Economia,False,8627,"[15966, 5040, 7982, 7, 273, 6742, 46, 292, 107...","[7679, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
7971,"O governador Romeu Zema deu início, nesta quin...",Economia,False,826,"[15966, 5040, 7982, 7, 273, 6742, 46, 28, 1330...","[7679, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
7972,"Nesta segunda-feira (1º), a aplicação da 2ª do...",Geral,False,5800,"[15966, 5040, 7982, 7, 273, 6742, 46, 2343, 36...","[2250, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
7973,"Uma grande quantidade de goiaba, comercializad...",Policial,False,10918,"[15966, 5040, 7982, 7, 273, 6742, 46, 403, 134...","[19258, 77, 1, 0, 0, 0, 0, 0, 0, 0]"


In [None]:
# Run the base (not yet fine-tuned) model on one test example and print its
# output next to the reference category.
index = 2

sample = hf_ds['test'][index]
content = sample['content']
categories = sample['categories']

prompt = PROMPT_TEMPLATE.format(content=content)
input_ids = tokenizer(prompt, return_tensors="pt").to(original_model.device)

generation_config = GenerationConfig(max_new_tokens=MAX_LENGTH, num_beams=1)
original_model_outputs = original_model.generate(
    input_ids=input_ids["input_ids"],
    generation_config=generation_config,
)
original_model_text_output = tokenizer.decode(
    original_model_outputs[0],
    skip_special_tokens=True,
)

# NOTE(review): the labels below say "HUMAN SUMMARY" and "INSTRUCT MODEL",
# but the values shown are the reference category and the base model's output.
separator = 100*"-"
print(
    3*"\n",
    f"INDEX = {index}",
    separator,
    f'CONTENT:\n{content}',
    separator,
    f'BASELINE HUMAN SUMMARY:\n{categories}',
    separator,
    f'INSTRUCT MODEL:\n{original_model_text_output}',
    separator,
    sep="\n",
)





INDEX = 2
----------------------------------------------------------------------------------------------------
CONTENT:
Uma das maiores plataformas de mídia social do mundo, completa 15 anos de existência nesta quinta-feira,15. O Twitter! A plataforma permite que os usuários enviem...
----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Geral
----------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
Categorize a seguinte notícia: Uma das maiores plataformas de mídia social do mundo, completa 15 anos de existência nesta quinta-feira,15. O Twitter! A plataforma permite que os usuários enviem... Categoria:
----------------------------------------------------------------------------------------------------


# Treinamento

In [None]:
%%time

# Fine-tune the base model on the tokenized train split. Each run writes to a
# fresh, timestamped directory relative to the current working directory.
output_dir = f'./news-category-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    # NOTE(review): max_steps below caps the run; the recorded output stops at
    # global_step=50 (epoch 0.05), so these 10 epochs are never reached.
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=1,  # log the training loss at every step
    max_steps=50
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    # NOTE(review): no evaluation schedule is configured in training_args, so
    # presumably this split is only used by an explicit trainer.evaluate() —
    # confirm.
    eval_dataset=tokenized_datasets['test']
)

trainer.train()



Step,Training Loss
1,15.375
2,5.2812
3,3.1406
4,1.8359
5,1.0312
6,1.0625
7,0.5156
8,0.2012
9,0.3008
10,0.2793


CPU times: user 55.1 s, sys: 20.4 s, total: 1min 15s
Wall time: 1min 22s


TrainOutput(global_step=50, training_loss=0.74810546875, metrics={'train_runtime': 78.7802, 'train_samples_per_second': 5.077, 'train_steps_per_second': 0.635, 'total_flos': 338288640000000.0, 'train_loss': 0.74810546875, 'epoch': 0.05})

In [None]:
# import evaluate

# metric = evaluate.load("glue", "mrpc")

# metric.compute(predictions=preds, references=predictions.label_ids)

In [None]:
# Name the trained model for the cells below.
# NOTE(review): the Trainer was built with model=original_model, so this
# presumably refers to the same object rather than a copy — confirm.
instruct_model = trainer.model

In [None]:
# Run the fine-tuned model on one test example and print its output next to
# the reference category.
index = 18

content = hf_ds['test'][index]['content']
categories = hf_ds['test'][index]['categories']

prompt = PROMPT_TEMPLATE.format(content=content)

# Place the inputs on whatever device the model lives on instead of assuming
# "cuda", so the cell also runs on a CPU-only runtime.
input_ids = tokenizer(prompt, return_tensors="pt").to(instruct_model.device)


instruct_model_outputs = instruct_model.generate(
    input_ids=input_ids["input_ids"],
    attention_mask=input_ids["attention_mask"],
    generation_config=GenerationConfig(max_new_tokens=MAX_LENGTH, num_beams=1),
)
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(3*"\n")
print(f"INDEX = {index}")
print(100*"-")
print(f'CONTENT:\n{content}')
print(100*"-")
print(f'BASELINE HUMAN SUMMARY:\n{categories}')
print(100*"-")
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(100*"-")





INDEX = 18
----------------------------------------------------------------------------------------------------
CONTENT:
Começa, no próximo dia 15 de março, o período para o envio da Declaração de Imposto de Renda Pessoa Física (DIRPF) 2023 à Receita Federal....
----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Geral
----------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
Geral
----------------------------------------------------------------------------------------------------


In [None]:
# Persist the trained model to the project folder on the mounted Drive.
trainer.save_model(FINE_TUNED_MODEL_PATH)

# Predições

In [None]:
# Rebind the module-level tokenizer, loading it from the base checkpoint
# identifier (the fine-tuned model path is not used for the tokenizer).
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Reload the fine-tuned weights from Drive with bfloat16 weights. This rebinds
# `instruct_model`, replacing the reference to the in-memory trained model.
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(FINE_TUNED_MODEL_PATH, torch_dtype=torch.bfloat16)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [None]:
# %%time

# predictions = trainer.predict(tokenized_datasets["validation"])
# print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
%%time

for index in range(21,40):

    content = hf_ds['test'][index]['content']
    categories = hf_ds['test'][index]['categories']

    prompt = PROMPT_TEMPLATE.format(content=content)

    input_ids = tokenizer(prompt, return_tensors="pt")


    instruct_model_outputs = instruct_model.generate(input_ids=input_ids["input_ids"], generation_config=GenerationConfig(max_new_tokens=MAX_LENGTH, num_beams=1))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    print(3*"\n")
    print(f"INDEX = {index}")
    print(100*"-")
    print(f'CONTENT:\n{content}')
    print(100*"-")
    print(f'BASELINE HUMAN SUMMARY:\n{categories}')
    print(100*"-")
    print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
    print(100*"-")





INDEX = 21
----------------------------------------------------------------------------------------------------
CONTENT:
Dois irmãos foram presos pela Polícia Militar após uma tentativa de homicídio na Zona Sul de Poços de Caldas nesta sexta-feira (7), no Bairro Conjunto...
----------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Policial
----------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
Policial
----------------------------------------------------------------------------------------------------




INDEX = 22
----------------------------------------------------------------------------------------------------
CONTENT:
Começou na sexta-feira,15, o Festival gastronômico São João da Boa Mesa, em São João da Boa Vista (SP).  O festival segue até o dia 31...
-------------------------------------------------------------------------------------------------

In [None]:
# Debugging: show the prompt left bound by the previously executed cell
# (the last iteration of the prediction loop when run top to bottom).
print(prompt)


Categorize a seguinte notícia:

Acompanhado do sócio da rádio Onda Poços e presidente da Associação Mineira de Rádio e Televisão (AMIRT), Luciano Pimenta Corrêa Peres, o governador de Minas...

Categoria:



In [None]:
# Debugging: tokenize that prompt and inspect the result. Despite its name,
# `input_ids` holds the full tokenizer output (input_ids and attention_mask).
input_ids = tokenizer(prompt, return_tensors="pt")
input_ids

{'input_ids': tensor([[15966,  5040,  7982,     7,   273,  6742,    46, 22770,    53,    10,
          7811,    11,  1527,    28,  1302, 19848,     6,     8,   409,    11,
          1610, 19961,     4,  2779,     8, 11969,    24,    89, 10608,  9285,
            63, 11000, 20719, 10585, 14102,     6,     3,     9,  1330,     4,
          1339,     5,     5,     5,    31,  6086,    46,    31,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

In [None]:
# Debugging: generate from the tokenized prompt and inspect the raw output
# token ids.
instruct_model_outputs = instruct_model.generate(input_ids=input_ids["input_ids"], generation_config=GenerationConfig(max_new_tokens=MAX_LENGTH, num_beams=1))
instruct_model_outputs

tensor([[   0,   31, 6086,   46,   31,    1]])

In [None]:
# Debugging: decode the generated ids back to text, dropping special tokens.
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

instruct_model_text_output

'Categoria: '