In [1]:
# import os
import re
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertModel,
    get_linear_schedule_with_warmup,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import spacy

In [2]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on demand.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt", ...)``
            whose result exposes ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length passed to the tokenizer for padding and truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors.

        Keys are ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # Collapse each encoded tensor to 1-D before handing it to the caller.
        item = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(sample_label)
        return item

In [3]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by a two-layer linear head that emits 3 logits.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        num_classes: Accepted for interface compatibility.

    NOTE(review): ``num_classes`` is never read; the head is fixed at
    ``hidden_size -> 10 -> 3``. Wiring it in would change the parameter shapes
    and therefore the ``state_dict`` layout, so confirm against saved
    checkpoints before changing it.
    """

    def __init__(self, bert_model_name: str, num_classes: int):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.1)
        self.linear1 = nn.Linear(
            in_features=self.bert.config.hidden_size, out_features=10
        )
        self.linear2 = nn.Linear(in_features=10, out_features=3)

    def forward(self, input_ids, attention_mask):
        """Return raw logits (no softmax is applied) for the given batch."""
        # Run the inputs through BERT and keep only its pooled output.
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout on the pooled representation, then the two linear layers.
        hidden = self.linear1(self.dropout(encoded.pooler_output))
        return self.linear2(hidden)

In [4]:
def train(model: nn.Module, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` in place.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning logits accepted by ``nn.CrossEntropyLoss``.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` holding tensors.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler stepped once per batch, or ``None``
            to train with the optimizer's current learning rate.
        device: Device the batch tensors are moved to.
    """
    # Training mode is required because layers such as Dropout and BatchNorm
    # behave differently during training and evaluation.
    model.train()
    # The criterion is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss()

    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()  # Update the weights from the computed gradients.
        if scheduler is not None:
            scheduler.step()  # Update the learning rate after every batch.

    

In [5]:
def evaluate(model: nn.Module, data_loader: DataLoader, device: torch.device):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one row of class scores per sample.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` holding tensors.
        device: Device (or device string) the batch tensors are moved to.

    Returns:
        A ``(accuracy, report)`` tuple: the results of ``accuracy_score`` and
        ``classification_report`` over all collected labels and predictions.
    """
    model.eval()
    predictions = []
    actual_labels = []
    # No gradients are needed during evaluation because no weights are updated.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest score in each row.
            preds = outputs.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(
        actual_labels, predictions
    )

In [6]:


def _get_spacy_pipeline():
    """Load the spaCy pipeline on first use and reuse it on later calls."""
    nlp = getattr(_get_spacy_pipeline, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm", exclude=["parser", "ner"])
        _get_spacy_pipeline._nlp = nlp
    return nlp


def predict_tratamento_texto(text: str):
    """Normalize raw text before it is tokenized for inference.

    Steps, in order: strip HTML-like tags, collapse runs of spaces, trim,
    lowercase, then keep the lemma of every token that is neither a stop word
    nor punctuation.

    Previously the spaCy document was built from the raw text, so the cleaning
    steps were computed and then thrown away. The pipeline now runs on the
    cleaned text.

    NOTE(review): the model should see the same preprocessing at training and
    inference time; confirm the training-side cleaning matches this order.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Tags become a space so the words on either side are not glued together.
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(" +", " ", text).strip().lower()
    doc = _get_spacy_pipeline()(text)
    return " ".join(
        token.lemma_ for token in doc if not token.is_stop and not token.is_punct
    )

In [7]:
def predict_sentiment(
    text: str,
    model: nn.Module,
    tokenizer: BertTokenizer,
    device: torch.device,
    max_length: int,
):
    r"""Predict the sentiment label of a single, previously unseen text.

    Meant to be used after the model has been fitted, for inference on texts
    that were not part of training.

    Args:
        text: Raw input text; it is cleaned with ``predict_tratamento_texto``.
        model: Classifier called as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Tokenizer used to encode the cleaned text.
        device: Device (or device string) the encoded tensors are moved to.
        max_length: Length passed to the tokenizer for padding and truncation.

    Returns:
        ``"positive"`` for class index 1, ``"negative"`` for class index 0 and
        ``"neutral"`` for any other index.
    """
    model.eval()
    text = predict_tratamento_texto(text)
    encoding = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model emits three scores per sample; take the index of the largest.
        predicted_class = outputs.argmax(dim=1).item()

    # A two-way mapping would report the third class as "negative".
    if predicted_class == 1:
        return "positive"
    if predicted_class == 0:
        return "negative"
    return "neutral"

In [24]:
def predict_sentiment(
    text: str, model: BertModel, tokenizer: BertTokenizer, device: str, max_length: int
):
    """Classify one text as ``"positive"``, ``"negative"`` or ``"neutral"``.

    The text is cleaned with ``predict_tratamento_texto``, encoded to
    ``max_length`` tokens and scored by ``model`` in evaluation mode. Class
    index 1 maps to ``"positive"``, 0 to ``"negative"`` and anything else to
    ``"neutral"``.
    """
    model.eval()
    cleaned = predict_tratamento_texto(text)
    batch = tokenizer(
        cleaned,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        class_index = torch.max(logits, dim=1).indices.item()

    return {1: "positive", 0: "negative"}.get(class_index, "neutral")

In [8]:
# Notebook-wide settings.

# Pretrained checkpoint and tokenization length.
bert_model_name = "bert-base-uncased"
max_length = 128

# Value handed to BERTClassifier as ``num_classes`` (original note: number of neurons).
num_classes = 10

# Optimization settings.
batch_size, num_epochs = 16, 4
learning_rate = 2e-5

In [9]:
# Load the pretrained tokenizer published under `bert_model_name`; it is passed
# to predict_sentiment in the prediction cells below.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [21]:
# Checkpoint holding the fine-tuned classifier weights (a state_dict).
PATH_MODEL_SAVE = "sentiment_classifier_en_49500_reviews.pth"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU session still loads when only the CPU is available.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH_MODEL_SAVE, map_location=device))

<All keys matched successfully>

In [12]:
# AdamW over every model parameter, using the learning rate from the parameters cell.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Learning-rate schedule (get_linear_schedule_with_warmup), currently disabled.
# NOTE(review): `total_steps` is not defined in the visible cells; define it
# before re-enabling the lines below.
# scheduler = get_linear_schedule_with_warmup(
#     optimizer, num_warmup_steps=0, num_training_steps=total_steps
# )

In [26]:
# Captain Marvel (2019) review, rated 3/10 by its author.
# Plain string (no f-prefix: there is nothing to interpolate), written as two
# adjacent literals so the paragraph break is an explicit "\n".
test_text = (
    'I am a Marvel fan. I go see all the movies, then buy the blu-rays. I have character statues, posters and have read the comics including Captain Marvel. There is a reason people become fans of certain things, and those things done well over time generate more fans until it becomes a "Juggernaut" like the MCU has. I fear that it\'s not unstoppable however, as the sub-par Captain Marvel has painfully illustrated. The movie is a pastiche of poorly written scenes with a bit of uninspired action sprinkled here and there with barely any plot at all. Carol Danvers has no character development whatsoever, no adversity to overcome besides amnesia and no villain to fight. I know there are people saying they like this movie, but I feel like they either aren\'t being honest or they like it for personal reasons that are not present in the movie. Objectively speaking, in terms of the technical aspects of storytelling, Captain Marvel just isn\'t effective. Subjectively, I was bored the entire time and the nostalgia parts made me feel as though JJ Abrams was poking me in the ribs asking me if I \'member the 90\'s. Yes. Yes I do. And for the record, people are criticizing this movie because it was bad, not because they "haaaate the wooomans!" Dismissing legitimate criticisms using this tactic will not produce better films, which is the only thing most people want. The problem is that they built the movie around the idea of "strong woman" and promoted it as an identity film, so when the movie ends up being bad some people rush to defend it because they don\'t want the IDEA to fail. The movie itself is secondary - just a vehicle to slap their bumper stickers on, which is a real shame because this could have been a great addition to the MCU. I hope they learn something from this, but from what I\'ve heard it appears that Marvel actually plans to escalate the identity politics in the next phase which would be an absolute disaster for them. \n'
    'When pointing to the box office as a defense for how "good" Captain Marvel was, just remember that tons of paying customers didn\'t like it at all.'
)
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
print(test_text)
print(f"Predicted sentiment: {sentiment}")

I am a Marvel fan. I go see all the movies, then buy the blu-rays. I have character statues, posters and have read the comics including Captain Marvel. There is a reason people become fans of certain things, and those things done well over time generate more fans until it becomes a "Juggernaut" like the MCU has. I fear that it's not unstoppable however, as the sub-par Captain Marvel has painfully illustrated. The movie is a pastiche of poorly written scenes with a bit of uninspired action sprinkled here and there with barely any plot at all. Carol Danvers has no character development whatsoever, no adversity to overcome besides amnesia and no villain to fight. I know there are people saying they like this movie, but I feel like they either aren't being honest or they like it for personal reasons that are not present in the movie. Objectively speaking, in terms of the technical aspects of storytelling, Captain Marvel just isn't effective. Subjectively, I was bored the entire time and th

In [25]:
# Captain Marvel (2019) review, rated 10/10 by its author.
test_text = """I loved every moment of this film. We speak so much about representation and here it is in a beautifully wrapped box. Where was this hitter when I was growing up in southwest Virginia? Not in my library I assure you. And yes, I know this isn't the real origin story. But I needed this movie. I am 41 years old. It shouldn't have taken this long."""
sentiment = predict_sentiment(
    text=test_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=max_length,
)
# Echo the review, then the predicted label on its own line.
print(test_text, f"Predicted sentiment: {sentiment}", sep="\n")

I loved every moment of this film. We speak so much about representation and here it is in a beautifully wrapped box. Where was this hitter when I was growing up in southwest Virginia? Not in my library I assure you. And yes, I know this isn't the real origin story. But I needed this movie. I am 41 years old. It shouldn't have taken this long.
Predicted sentiment: positive
