## Dependências

In [1]:
import os
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer

## Parâmetros

In [2]:
# Token cap applied to tokenized prompts; also the default generation budget
# of NewsCategoryPredictor.
MAX_LENGTH = 200

# Every path below is resolved against the kernel's current working directory.
PROJECT_PATH = os.getcwd()
RAW_DATA_PATH = os.path.join(
    PROJECT_PATH,
    "data",
    "processed",
    "processed-pocos-news.csv",
)

# Prompt wrapped around each article; `{content}` receives the news text.
PROMPT_TEMPLATE = """
Categorize a seguinte notícia:

{content}

Categoria:
"""

# Model identifier handed to `from_pretrained` for the tokenizer.
BASE_MODEL = 'unicamp-dl/ptt5-large-portuguese-vocab'

# Directory of the fine-tuned weights, named after the last segment of BASE_MODEL.
FINE_TUNED_MODEL_PATH = os.path.join(
    PROJECT_PATH,
    "models",
    "finetune",
    f"finetuned-{BASE_MODEL.rsplit('/', 1)[-1]}",
)

## Funções

In [3]:
def ttv_split(ds, train_ratio, test_ratio, seed=None):
    """Split a dataset into train / test / validation partitions.

    The validation share is whatever remains after the other two:
    ``1 - train_ratio - test_ratio``.

    Args:
        ds: Object exposing ``train_test_split(train_size=..., seed=...)`` that
            returns a mapping with ``'train'`` and ``'test'`` entries
            (e.g. a ``datasets.Dataset``).
        train_ratio: Fraction of rows assigned to the train partition.
        test_ratio: Fraction of rows assigned to the test partition.
        seed: Optional seed forwarded to both shuffles. ``None`` (the default)
            keeps the previous unseeded behaviour; pass an integer to obtain
            the same partitions on every run.

    Returns:
        DatasetDict with the keys ``'train'``, ``'test'`` and ``'validation'``.

    Raises:
        ValueError: If either ratio is not strictly between 0 and 1, or if the
            two ratios leave no room for a validation partition.
    """
    if not 0 < train_ratio < 1 or not 0 < test_ratio < 1:
        raise ValueError(
            f"train_ratio and test_ratio must lie strictly between 0 and 1, "
            f"got train_ratio={train_ratio!r}, test_ratio={test_ratio!r}"
        )

    valid_ratio = 1 - train_ratio - test_ratio
    if valid_ratio <= 0:
        # Fail here with a readable message instead of deep inside the
        # second split, where the computed train_size would be >= 1.
        raise ValueError(
            f"train_ratio + test_ratio must be below 1 to leave rows for "
            f"validation, got {train_ratio!r} + {test_ratio!r}"
        )

    first_cut = ds.train_test_split(train_size=train_ratio, seed=seed)

    # Re-split the held-out rows so test and validation keep their relative
    # proportions.
    held_out = first_cut['test']
    second_cut = held_out.train_test_split(
        train_size=test_ratio / (test_ratio + valid_ratio),
        seed=seed,
    )

    return DatasetDict({
        'train': first_cut['train'],
        'test': second_cut['train'],
        'validation': second_cut['test'],
    })

def convert_to_huggingdface_dataset(df):
    """Turn the news DataFrame into a DatasetDict with four partitions.

    Rows whose ``predict`` flag is False are split by ``ttv_split`` with
    ``train_ratio=0.85`` and ``test_ratio=0.10`` (validation takes the rest);
    rows whose flag is True are stored, unsplit, under the ``'predict'`` key.

    Args:
        df: pandas DataFrame with a ``predict`` column usable as a boolean
            mask (it is negated with ``~``).

    Returns:
        The mapping returned by ``ttv_split`` with an extra ``'predict'`` entry.
    """
    predict_flag = df['predict']
    rows_for_training = df[~predict_flag]
    rows_for_prediction = df[predict_flag]

    # NOTE(review): from_pandas is called on filtered frames without
    # preserve_index=False, which is presumably why '__index_level_0__'
    # shows up in the resulting features.
    splits = ttv_split(
        Dataset.from_pandas(rows_for_training),
        train_ratio=0.85,
        test_ratio=0.10,
    )
    splits['predict'] = Dataset.from_pandas(rows_for_prediction)
    return splits


def tokenize_function(example: dict) -> dict:
    """Add ``input_ids`` and ``labels`` to a batch of news examples.

    Relies on the module-level names ``tokenizer``, ``PROMPT_TEMPLATE`` and
    ``MAX_LENGTH``.

    Args:
        example: Batch mapping. ``example["content"]`` is iterated to build one
            prompt per item; ``example["categories"]`` is tokenized as target.

    Returns:
        The same mapping object, mutated in place, with the two new keys.
    """
    # NOTE(review): no visible cell assigns `tokenizer` (the only assignment is
    # commented out); it must exist in the kernel before this is called.

    # Options shared by both tokenizer calls; only the length cap differs.
    shared_options = dict(padding="max_length", truncation=True, return_tensors="pt")

    prompts = list(map(lambda text: PROMPT_TEMPLATE.format(content=text), example["content"]))
    encoded_prompts = tokenizer(prompts, max_length=MAX_LENGTH, **shared_options)
    example['input_ids'] = encoded_prompts.input_ids

    # Targets are capped at 30 tokens.
    # NOTE(review): padded label positions are kept as-is, not masked —
    # confirm this is what the training loop expects.
    encoded_targets = tokenizer(example["categories"], max_length=30, **shared_options)
    example['labels'] = encoded_targets.input_ids

    return example

In [4]:
class NewsCategoryPredictor:
    """Generate a category string for a news article with a seq2seq model.

    The tokenizer is loaded from ``BASE_MODEL`` and the model weights from
    ``model_path``; prompts are built with ``PROMPT_TEMPLATE``.
    """

    def __init__(
        self,
        model_path,
        device,
        torch_dtype=torch.bfloat16,
        max_new_tokens=MAX_LENGTH,
        num_beams=1,
        skip_special_tokens=True,
        max_input_length=MAX_LENGTH,
    ):
        """Load the tokenizer and the model and move the model to ``device``.

        Args:
            model_path: Path or identifier passed to
                ``AutoModelForSeq2SeqLM.from_pretrained``.
            device: Device the model and the encoded prompts are moved to.
            torch_dtype: dtype the model weights are loaded with.
            max_new_tokens: Generation budget passed to ``GenerationConfig``.
            num_beams: Beam count passed to ``GenerationConfig``.
            skip_special_tokens: Forwarded to ``tokenizer.decode``.
            max_input_length: Token cap applied to the prompt before
                generation. Defaults to ``MAX_LENGTH``, the same cap
                ``tokenize_function`` applies to prompts.
        """
        self.model_path = model_path
        self.device = device
        self.torch_dtype = torch_dtype
        self.max_new_tokens = max_new_tokens
        self.num_beams = num_beams
        self.skip_special_tokens = skip_special_tokens
        self.max_input_length = max_input_length
        # NOTE(review): the tokenizer comes from BASE_MODEL rather than
        # model_path — presumably the fine-tuned directory has no tokenizer
        # files; confirm.
        self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        self.instruct_model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path,
            torch_dtype=torch_dtype
        ).to(device)

    def predict(self, content):
        """Return the decoded model output for one article.

        Args:
            content: Article text inserted into ``PROMPT_TEMPLATE``.

        Returns:
            Decoded text of the first generated sequence.
        """
        prompt = PROMPT_TEMPLATE.format(content=content)

        # Truncate the prompt: tokenize_function caps prompts at MAX_LENGTH
        # tokens, and an uncapped article makes the encoder input (and the
        # memory it needs) grow with the article length.
        encoded = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_input_length,
            return_tensors="pt",
        ).to(self.device)

        generated = self.instruct_model.generate(
            input_ids=encoded["input_ids"],
            # Pass the mask explicitly instead of letting generate() infer it.
            attention_mask=encoded["attention_mask"],
            generation_config=GenerationConfig(
                max_new_tokens=self.max_new_tokens,
                num_beams=self.num_beams,
            ),
        )
        return self.tokenizer.decode(
            generated[0],
            skip_special_tokens=self.skip_special_tokens,
        )

### Dataset

In [5]:
# Read the processed news CSV; "Unnamed: 0" and "id" are dropped only when present.
df = pd.read_csv(RAW_DATA_PATH)
df = df.drop(columns=["Unnamed: 0", "id"], errors="ignore")

# Build the train / test / validation / predict partitions.
hf_ds = convert_to_huggingdface_dataset(df)

# Move the 'predict' partition into its own variable so hf_ds keeps only the
# train / test / validation splits.
hf_ds_predict = hf_ds.pop('predict')

hf_ds

DatasetDict({
    train: Dataset({
        features: ['content', 'categories', 'predict', '__index_level_0__'],
        num_rows: 7975
    })
    test: Dataset({
        features: ['content', 'categories', 'predict', '__index_level_0__'],
        num_rows: 938
    })
    validation: Dataset({
        features: ['content', 'categories', 'predict', '__index_level_0__'],
        num_rows: 470
    })
})

### Model

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# cell does not fail on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = NewsCategoryPredictor(model_path=FINE_TUNED_MODEL_PATH, device=DEVICE)

## Predictions

In [11]:
# Disabled cell: loads the base (not fine-tuned) model and its tokenizer; kept for reference only.

# model_name = BASE_MODEL

# original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# tokenizer = T5Tokenizer.from_pretrained(model_name)

In [20]:
%%time



# Number of random test articles to inspect, and the seed that makes the same
# articles appear on every run.
N_SAMPLES = 10
SAMPLE_SEED = 42

test_split = hf_ds['test']
rng = np.random.default_rng(SAMPLE_SEED)

# Draw all indices at once and without replacement, so no article is shown
# twice; the size is clamped in case the split has fewer rows than N_SAMPLES.
sample_indices = rng.choice(
    test_split.num_rows,
    size=min(N_SAMPLES, test_split.num_rows),
    replace=False,
)

for index in map(int, sample_indices):

    # Fetch the row once instead of indexing the dataset per column.
    row = test_split[index]
    content = row['content']
    categories = row['categories']
    instruct_model_text_output = model.predict(content)

    print(3*"\n")
    print(f"INDEX = {index}")
    print(100*"-")
    print(f'CONTENT:\n{content}')
    print(100*"-")
    # The reference value is the human-assigned category, not a summary.
    print(f'HUMAN CATEGORIES:\n{categories}')
    print(100*"-")
    print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
    print(100*"-")

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.81 GiB total capacity; 3.67 GiB already allocated; 1.12 MiB free; 3.68 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF