## Text Classification with BERT

Referência

<h4>https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b</h4>


### Criando o ambiente virtual

Digite no terminal os seguintes comandos:

> python -m venv .venv-tcc

> python -m ipykernel install --user --name=.venv-tcc

> pip install ipywidgets

In [1]:
### Instalando as dependências necessárias
# ! pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# ! pip install transformers
# # ! pip install torchviz
# ! pip install scikit-learn
# ! pip install ipywidgets
# Tratamento de dados
# ! pip install -U pip setuptools wheel
# ! pip install -U 'spacy[cuda12x,transformers,lookups]'
# ! python -m spacy download en_core_web_trf


In [2]:
# import os
import re
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertModel,
    get_linear_schedule_with_warmup,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

### Baixar o dataset CSV

Caso você queira executar o modelo no Colab, baixe o conjunto de dados do IMDB no link abaixo e adicione-o à raiz do seu projeto:

<a href="https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews" target="_blank">Conjunto de dados IMDB de 50 mil críticas de filmes</a>


### Removing HTML tags using regular expressions

In [3]:
def remove_tags(raw_text):
    """Return ``raw_text`` with every ``<...>`` span removed.

    The pattern is non-greedy, so each tag is matched on its own and the
    text between two tags is kept.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub("", raw_text)

In [4]:
def load_imdb_data(data_path):
    """Read the CSV at ``data_path`` and binarise its ``sentiment`` column.

    Rows whose sentiment equals ``"positive"`` become 1; every other value
    becomes 0. The DataFrame with the rewritten column is returned.
    """
    frame = pd.read_csv(data_path)
    raw_sentiments = frame["sentiment"].tolist()
    frame["sentiment"] = [int(value == "positive") for value in raw_sentiments]
    return frame

In [5]:
def load_texts_labels(df):
    """Split ``df`` into parallel lists of review texts and labels.

    Returns a ``(texts, labels)`` tuple taken from the ``review`` and
    ``sentiment`` columns, in row order.
    """
    return df["review"].tolist(), df["sentiment"].tolist()

In [6]:
# Load the review CSV (the file name suggests an already treated/lemmatized
# export -- confirm) and strip any remaining HTML tags from the review text.
data_path = "IMDB_Dataset_tratado_lemma.csv"
df = load_imdb_data(data_path)
df['review'] = df["review"].apply(remove_tags)


In [7]:
display(df)

Unnamed: 0,review,sentiment
0,reviewer mention watch 1 oz episode hook right...,1
1,wonderful little production film technique una...,1
2,think wonderful way spend time hot summer week...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stunnin...,1
...,...,...
49577,think movie right good job creative original e...,1
49578,bad plot bad dialogue bad act idiotic directin...,0
49579,catholic teach parochial elementary school nun...,0
49580,go disagree previous comment maltin second rat...,0


In [8]:
# Keep only the first 49,500 rows of the DataFrame.
df = df[:49500]

In [9]:
df['review'].size

49500

In [10]:
# Extract the texts/labels and inspect the longest review.
texts, labels = load_texts_labels(df)
# The longest review is selected by its number of whitespace-separated words...
maior_string = max(texts, key=lambda x: len(x.split()))
# NOTE(review): ...but the value printed below is its length in characters,
# not in words -- confirm which measure is intended.
tam_maior_string= len(maior_string)
print(tam_maior_string)

8004


In [11]:
# texts, labels = texts[:100], labels[:100]

In [12]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Stores parallel sequences of texts and labels together with the
    tokenizer and the ``max_length`` value handed to it on every call.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One sample per stored text.
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its label.

        The result is a dict with the keys ``input_ids``, ``attention_mask``
        and ``label``; the two tokenizer tensors are flattened to 1-D.
        """
        text, label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        sample = {
            key: tokenized[key].flatten() for key in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(label)
        return sample

In [13]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and two stacked linear layers.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Accepted so existing call sites keep working, but it
            does NOT size any layer. NOTE(review): the head sizes used to be
            hard-coded and are now controlled by the keyword-only arguments
            below; decide whether this argument should drive ``num_logits``.
        hidden_units: Output width of the first linear layer (default 10).
        num_logits: Number of logits produced per sample (default 3).
        dropout_prob: Dropout probability applied to the pooled output.

    The defaults reproduce the layer shapes that were previously hard-coded,
    so state dicts saved with the earlier definition still load.
    """

    def __init__(
        self,
        bert_model_name: str,
        num_classes: int,
        *,
        hidden_units: int = 10,
        num_logits: int = 3,
        dropout_prob: float = 0.1,
    ):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout_prob)
        self.linear1 = nn.Linear(self.bert.config.hidden_size, hidden_units)
        self.linear2 = nn.Linear(hidden_units, num_logits)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) logits for a batch of token ids."""
        # Run the inputs through the BERT encoder.
        outputs_bert = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the pooler output as the representation of the whole sequence.
        pooled_output = outputs_bert.pooler_output
        # Regularise the pooled representation before the linear head.
        x = self.dropout(pooled_output)
        # No activation sits between the two linear layers; no softmax is
        # applied, so callers receive logits.
        logits = self.linear1(x)
        logits_out = self.linear2(logits)
        return logits_out

In [14]:
def train(model: nn.Module, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and expected to return logits accepted by ``CrossEntropyLoss``.
        data_loader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``label`` (each a tensor).
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.
    """
    # Training mode: layers such as Dropout behave differently than in eval.
    model.train()
    # The loss module holds no per-batch state, so build it once.
    criterion = nn.CrossEntropyLoss()

    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()  # Update the weights from the computed gradients.
        scheduler.step()  # Update the learning rate; this runs after every batch.

    

In [15]:
def evaluate(model: nn.Module, data_loader: DataLoader, device: "str | torch.device"):
    """Score ``model`` on every batch of ``data_loader`` without updating it.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of logits per sample.
        data_loader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``label``.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A ``(accuracy, report)`` tuple holding the results of
        ``accuracy_score`` and ``classification_report`` computed over all
        collected labels and predictions.
    """
    model.eval()
    predictions = []
    actual_labels = []
    # Gradients are not needed because no weights are updated here.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit in each row.
            preds = outputs.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # travel to ``device``.
            actual_labels.extend(batch["label"].tolist())
    return accuracy_score(actual_labels, predictions), classification_report(
        actual_labels, predictions
    )

In [63]:
from functools import lru_cache

import spacy


@lru_cache(maxsize=1)
def _load_spacy_pipeline():
    """Load the spaCy pipeline once and reuse it for every prediction."""
    return spacy.load("en_core_web_sm", exclude=["parser", "ner"])


def predict_tratamento_texto(text: str):
    """Clean and lemmatize ``text`` before it is tokenized for the model.

    Steps, in order: remove ``<...>`` tags, collapse runs of spaces and trim
    the ends, lowercase, then keep the lemma of every token that is neither
    a stop word nor punctuation, joined by single spaces.

    The cleaning runs BEFORE the spaCy pipeline so that it actually affects
    the result; previously the document was built from the raw text and the
    cleaned string was discarded.
    """
    # Strip tags first so the space collapsing below also closes the gaps
    # that tag removal leaves behind.
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(" +", " ", text).strip()
    text = text.lower()
    doc = _load_spacy_pipeline()(text)
    return " ".join(
        token.lemma_ for token in doc if not token.is_stop and not token.is_punct
    )

In [60]:
def predict_sentiment(
    text: str, model: BertModel, tokenizer: BertTokenizer, device: str, max_length: int
):
    """Classify a single new text with an already fitted model.

    The text is pre-processed with ``predict_tratamento_texto``, tokenized
    with ``max_length`` padding/truncation settings, and run through
    ``model`` without gradients. The predicted index, the raw outputs and
    the index as a Python number are printed. Returns ``"positive"`` when
    the predicted index is 1 and ``"negative"`` for any other index.
    """
    model.eval()
    cleaned_text = predict_tratamento_texto(text)
    encoding = tokenizer(
        cleaned_text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    model_inputs = {
        name: encoding[name].to(device) for name in ("input_ids", "attention_mask")
    }

    with torch.no_grad():
        outputs = model(**model_inputs)
        _, preds = torch.max(outputs, dim=1)

    print("preds: ", preds)
    print(outputs)

    predicted_index = preds.item()
    print(predicted_index)

    if predicted_index == 1:
        return "positive"
    return "negative"

In [34]:
# Set up parameters
bert_model_name = "bert-base-uncased"
# "Number of neurons". NOTE(review): this is passed to BERTClassifier as
# ``num_classes``, but the classifier does not use it to size its layers.
num_classes = 10
max_length = 128  # passed to the tokenizer as max_length (padding/truncation)
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [35]:
# Hold out 20% of the samples for validation; random_state fixes the split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [36]:
# Tokenizer matching the pretrained BERT checkpoint, shared by both datasets.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(
    train_texts, train_labels, tokenizer, max_length
)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

# Only the training loader shuffles; the validation loader uses the default order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [37]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [38]:
# AdamW over all model parameters; the schedule is sized for one scheduler
# step per training batch across all epochs.
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)  # Learning-rate schedule: num_warmup_steps is 0, so no warmup ramp is requested; per the transformers docs the rate then decreases linearly over total_steps.

In [None]:
# Equivalent to the fit() function of Keras and scikit-learn: train for one
# epoch, then report accuracy and the classification report on the validation set.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)
    

## Saving & Loading Model for Inference

Save:

In [None]:
# PATH_MODEL_SAVE = "sentiment_classifier_en_49500_reviews.pth"
# torch.save(model.state_dict(), "sentiment_classifier_en_49500_reviews.pth")

load:

In [61]:
# Rebuild the classifier and restore the weights saved after training.
PATH_MODEL_SAVE = "sentiment_classifier_en_49500_reviews.pth"
model = BERTClassifier(bert_model_name, num_classes).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load(PATH_MODEL_SAVE, map_location=device))
model.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [64]:
# Test sentiment prediction on a short text with upper-case letters and a trailing space.
test_text = "The animation BAD "
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
print(test_text)
print(f"Predicted sentiment: {sentiment}")

preds:  tensor([0], device='cuda:0')
tensor([[ 4.7905, -1.4969, -4.3061]], device='cuda:0')
0
The animation BAD 
Predicted sentiment: negative


In [None]:
# Test sentiment prediction on a full negative sentence.
test_text = "The movie was so bad and I would not recommend it to anyone."
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
print("Texto: ", test_text)
print(f"Predicted sentiment: {sentiment}")

torch.max(outputs, dim=1):  torch.return_types.max(
values=tensor([4.5734], device='cuda:0'),
indices=tensor([0], device='cuda:0'))
tensor([[ 4.5734, -1.3224, -4.3466]], device='cuda:0')
0
Texto:  The movie was so bad and I would not recommend it to anyone.
Predicted sentiment: negative


In [None]:
# Test sentiment prediction on a short positive sentence.
test_text = "Best movie of the year."
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
# Print the text that was actually classified rather than an unrelated literal.
print(test_text)
print(f"Predicted sentiment: {sentiment}")

torch.max(outputs, dim=1):  torch.return_types.max(
values=tensor([4.1800], device='cuda:0'),
indices=tensor([1], device='cuda:0'))
tensor([[-1.5372,  4.1800, -4.8720]], device='cuda:0')
1
Worst movie of the year.
Predicted sentiment: positive


In [None]:
# Test sentiment prediction on an ambiguous ("more or less") sentence.
test_text = "This movie is more or less"
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
print(test_text)
print(f"Predicted sentiment: {sentiment}")

torch.max(outputs, dim=1):  torch.return_types.max(
values=tensor([1.7780], device='cuda:0'),
indices=tensor([0], device='cuda:0'))
tensor([[ 1.7780,  1.2914, -4.6140]], device='cuda:0')
0
This movie is more or less
Predicted sentiment: negative


In [41]:
#  apenas para quando der erro de alocacao de tensores e dispositivos diferentes

import torch # type: ignore
import gc


# List the tensors that live on the GPU
def get_tensors_in_gpu():
    """Print type, size and device of every tensor found by ``gc`` with ``is_cuda`` set."""
    print("\nTensores na GPU:")
    for candidate in gc.get_objects():
        if not torch.is_tensor(candidate):
            continue
        if candidate.is_cuda:
            print(type(candidate), candidate.size(), candidate.device)


# List the tensors that are not on the GPU
def get_tensors_in_cpu():
    """Print type, size and device of every tensor found by ``gc`` with ``is_cuda`` unset."""
    print("\nTensores na CPU:")
    for candidate in gc.get_objects():
        if not torch.is_tensor(candidate):
            continue
        if not candidate.is_cuda:
            print(type(candidate), candidate.size(), candidate.device)

In [42]:
# List the tensors on the GPU, then ask PyTorch to empty its CUDA cache.
get_tensors_in_gpu()
torch.cuda.empty_cache()


Tensores na GPU:
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([3072]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([3072]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([768



In [43]:
# List the tensors that are not on the GPU

get_tensors_in_cpu()



Tensores na CPU:
<class 'torch.nested._internal.nested_tensor.NestedTensor'> torch.Size([1, j0, 3]) meta
<class 'torch.Tensor'> torch.Size([3, 3]) meta
<class 'torch.Tensor'> torch.Size([2]) meta
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<class 'torch.Tensor'> torch.Size([]) cpu
<clas