## Text Classification with BERT




### Criando o ambiente virtual

Digite no terminal os seguintes comandos:

> python -m venv .venv-tcc

> python -m ipykernel install --user --name=.venv-tcc

> pip install ipywidgets


In [25]:
### Instalando as dependências necessárias
# ! pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# ! pip install transformers
# # ! pip install torchviz
# ! pip install scikit-learn
# ! pip install ipywidgets
# Tratamento de dados
# ! pip install -U pip setuptools wheel
# ! pip install -U 'spacy'
# ! python -m spacy download en_core_web_sm

In [26]:
import re
import spacy
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertModel,
    get_linear_schedule_with_warmup,
)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Baixar CSV DAtaset

Caso você queira executar o modelo no colab, você deve baixar o conjunto de dados do IMDB no link abaixo e adiciona-lo a raiz do seu projeto:

<a href="https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews" target="_blank">Conjunto de dados IMDB de 50 mil críticas de filmes</a>


In [27]:
def load_imdb_data(data_path):
    df = pd.read_csv(data_path)
    df["sentiment"] = [
        1 if sentiment == "positive" else 0 for sentiment in df["sentiment"].tolist()
    ]
    return df

In [28]:
def load_texts_labels(df):
    texts = df["review"].tolist()
    labels = df["sentiment"].tolist()
    return texts, labels

In [29]:
DATA_PATH = "D:/tcc2/guilherme/data/IMDB_Dataset_tratado_sem_lemma.csv"
df = load_imdb_data(DATA_PATH)
df = df[:49_500]  # definir tamanho do df para
df = df[:1_000]  # definir tamanho do df para

In [30]:
display(df)

Unnamed: 0,review,sentiment
0,reviewers mentioned watching 1 oz episode hook...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1
...,...,...
995,sacred ask ernie fosselius days everybody vide...,1
996,hated hate self aware pretentious inanity masq...,0
997,usually try professional constructive criticiz...,0
998,like going film history class like school try ...,0


In [31]:
df["review"].size

1000

In [32]:
def maior_text(texts):
    return max(texts, key=lambda x: len(x.split()))

In [33]:
texts, labels = load_texts_labels(df)
maior_string = maior_text(texts)
tam_maior_string = len(maior_string)
print(tam_maior_string)

3643


In [34]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label),
        }

In [35]:
class MultiClassClassifier(nn.Module):
    def __init__(self, bert_model_name: str, hidden_size: int, num_classes: int):
        super(MultiClassClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.linear1 = nn.Linear(self.bert.config.hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        outputs_bert = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs_bert.pooler_output
        x = self.dropout(pooled_output)
        logits = self.linear1(x)
        logits = self.linear2(logits)
        probs = self.softmax(logits)
        return probs

In [36]:
def train(model: MultiClassClassifier, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()  # Atualiza os pesos do modelo usando os gradientes calculados.
        scheduler.step()  # Atualiza a taxa de aprendizado. Isso é feito após cada época

In [37]:
def evaluate(model: MultiClassClassifier, data_loader: TextClassificationDataset, device: str):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():  # Desativa o cálculo de gradientes. Durante a avaliação, não precisamos calcular os gradientes, pois não estamos atualizando os pesos do modelo.
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(
                outputs, dim=1
            )  # Obtém as previsões do modelo encontrando o índice do valor máximo na saída do modelo.
            predictions.extend(
                preds.cpu().tolist()
            )  # Adiciona as previsões  a predictions[]
            actual_labels.extend(
                labels.cpu().tolist()
            )  # Adiciona os rótulos reais a actual_labels[].
    return accuracy_score(actual_labels, predictions), classification_report(
        actual_labels, predictions
    )

In [38]:
def predict_tratamento_texto(text: str, lemma:bool):
    nlp = spacy.load("en_core_web_sm", exclude=["parser", "ner"])
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(re.compile("<.*?>"), "", text)
    doc = nlp(text)
    if lemma == True:
        text = " ".join(
            [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        )  # retorna o lemma
    else:
        text = " ".join(
            [token.text for token in doc if not token.is_stop and not token.is_punct]
        )  # return text
    return text

In [39]:
def predict_sentiment(
    text: str, model: MultiClassClassifier, tokenizer: BertTokenizer, device: str, max_length: int
):
    model.eval()
    text = predict_tratamento_texto(text, lemma=False)
    encoding = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        prob = model(input_ids=input_ids, attention_mask=attention_mask)

    prob_neg = prob[0, 0].item()
    prob_pos = prob[0, 1].item()
    print(f"prob negativa: {prob_neg:.5f}")
    print(f"prob positiva: {prob_pos:.5f}")

    if prob_neg >= 0.7 and prob_pos < 0.6:
        return "negative"
    elif prob_neg < 0.6 and prob_pos >= 0.7:
        return "positive"
    else:
        return "neutral"

In [40]:
# Set up parameters
BERT_MODEL_NAME = "bert-base-uncased"
hidden_size = 10 #Tamanho oculto, neuronios
num_classes = 2 # numero de neuronios saida/ categorias
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [41]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [42]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
train_dataset = TextClassificationDataset(
    train_texts, train_labels, tokenizer, max_length
)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiClassClassifier(BERT_MODEL_NAME,hidden_size, num_classes).to(device)

In [44]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
#: Isso configura o agendador de taxa de aprendizado. O agendador ajusta a taxa de aprendizado ao longo do treinamento. Neste caso, a taxa de aprendizado aumentará linearmente por um número de etapas de aquecimento e, em seguida, diminuirá linearmente.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [46]:
# Equivalente a Funcao fit()
for epoch in tqdm(range(num_epochs), colour="green", desc="Progresso: "):

    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Progresso:   0%|[32m          [0m| 0/4 [00:00<?, ?it/s]

Epoch 1/4


Progresso:  25%|[32m██▌       [0m| 1/4 [07:02<21:07, 422.44s/it]

Validation Accuracy: 0.7600
              precision    recall  f1-score   support

           0       0.78      0.75      0.76       104
           1       0.74      0.77      0.76        96

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200

Epoch 2/4


Progresso:  50%|[32m█████     [0m| 2/4 [14:02<14:01, 420.90s/it]

Validation Accuracy: 0.8500
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       104
           1       0.86      0.82      0.84        96

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200

Epoch 3/4


Progresso:  75%|[32m███████▌  [0m| 3/4 [21:03<07:01, 421.01s/it]

Validation Accuracy: 0.8550
              precision    recall  f1-score   support

           0       0.80      0.97      0.87       104
           1       0.96      0.73      0.83        96

    accuracy                           0.85       200
   macro avg       0.88      0.85      0.85       200
weighted avg       0.87      0.85      0.85       200

Epoch 4/4


Progresso: 100%|[32m██████████[0m| 4/4 [28:06<00:00, 421.52s/it]

Validation Accuracy: 0.8550
              precision    recall  f1-score   support

           0       0.80      0.95      0.87       104
           1       0.94      0.75      0.83        96

    accuracy                           0.85       200
   macro avg       0.87      0.85      0.85       200
weighted avg       0.87      0.85      0.85       200






## Saving & Loading Model for Inference

Save:


In [None]:
# PATH_MODEL_SAVE = "sentiment_classifier_Bert_IMDB_Dataset_sem_lemma.pth"
# torch.save(model.state_dict(), PATH_MODEL_SAVE)

load:


In [None]:
# # PATH_MODEL_SAVE = "sentiment_classifier_en_49500_reviews.pth"
# saved_model = MultiClassClassifier(BERT_MODEL_NAME,hidden_size, num_classes).to(device)
# saved_model.load_state_dict(torch.load(PATH_MODEL_SAVE))
# # saved_model.eval()

## Sentiment forecast

In [None]:
# Capita marvel 2019 5/10
text = """
    Plot

Carol Danvers     becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races.

Cast

Brie Larson, Samuel L. Jackson (Because duh), Jude "Just consistently dreadful" Law, Annette Bening, Djimon Hounsou, Clark Gregg and blink and you'll miss him Lee Pace who returns as Ronan but looks so different I didn't even think it was him.

Verdict

I watched this back when it was initially released, I watched it a second time a few days ago as the missus is wanting us to binge watch the entire MCU as she's very behind. My opinion has changed on the 2nd viewing and not in a good way, my rating has shifted from a 6/10 to 5/10.

You see straight out of the gate the first thing you notice about Carol Danvers is she's not really a character you can get behind. She's not funny, she's not entertaining, she comes across as a surly teenage girl who is just upset at the slightest thing and just doesn't want to be there. This is not a character you can build a movie around, like trying to make a teenage Groot movie! It wouldn't work, but he's okay as a side character.

Supporting cast are also hit and miss, Jackson and Gregg are great, but Lynch and Law just stink up every scene they're in.

I'm a Marvel fan but I recognize where it's weak, this is a distinctly average film that serves as a standalone origin story and doesn't contribute much to the universe as a whole.

Rants

I remember when the movie came out all the controversy with Brie Larson, I just had to Google what the controversy even was as I don't remember due to not focusing on such things. Now I can't really get a definitive answer. From what I see it's a combination of people not liking her attitude and her comments on feminism. So I Googled further to see what she said, she came across arrogant in them and a smidge out of touch but none of it explained the overwhelming hate I've seen aimed at her. Then I remembered that people talk about all the different types of bigotry but misogyny rarely comes up, I remembered that it's visibly increased over the past decade and appreciated why she's been targeted. News flash, the outspoken loud brash man hating femnists you likely thing of when you hear that word make up a very small percentage. Feminism is good, if you disagree I hope you simply don't know the meaning of the word.

The Good

Jackson and Gregg Has a couple of decent moments Not a bad soundtrack Goose!

The Bad

Larson isn't great Law and Lynch are terrible Lead just comes across unlikable.

Overall just a weak entry to the MC
"""

In [None]:
# CApita marvel 2019 6/10
text="""Mediocre Marvel is still pretty good.

My first review in a long time! Dont know why I decided to write about this movie.

I agree with most mediocre reviews I read here.

The pacing was pretty good.

Most of the action was good!

The story was ok and had some good twists.

I thought about giving this movie a 6 but after letting it sink in I decided to give it 8 out of 10

It entertained me and my company from beginning to end.

There were some eye rolling moments but they are easily forgiven."""

In [None]:
# Test sentiment prediction
# test_text = " very perfect very good very bad bad well, "
sentiment = predict_sentiment(text, model, tokenizer, device, max_length)
# sentiment2 = predict_sentiment(text, saved_model, tokenizer, device, max_length)

print(predict_tratamento_texto(text,lemma= False))
print(f"Predicted sentiment: {sentiment}")
# print(f"Predicted sentiment: {sentiment2}")

prob negativa: 0.00002
prob positiva: 0.99998
mediocre marvel pretty good review long time nt know decided write movie agree mediocre reviews read pacing pretty good action good story ok good twists thought giving movie 6 letting sink decided 8 10 entertained company beginning end eye rolling moments easily forgiven
Predicted sentiment: positive


In [None]:
# Test sentiment prediction
test_text = "The movie was so bad and I would not recommend it to anyone."
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
print("Texto: ", test_text)
print(f"Predicted sentiment: {sentiment}")

prob negativa: 0.99997
prob positiva: 0.00003
Texto:  The movie was so bad and I would not recommend it to anyone.
Predicted sentiment: negative


In [None]:
# Test sentiment prediction
test_text = "Best movie of the year. "
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
print("Worst movie of the year.")
print(f"Predicted sentiment: {sentiment}")

prob negativa: 0.00002
prob positiva: 0.99998
Worst movie of the year.
Predicted sentiment: positive


In [None]:
# Test sentiment prediction
test_text = "This movie is more or less very hungry"
sentiment = predict_sentiment(test_text, model, tokenizer, device, max_length)
print(test_text)
print(f"Predicted sentiment: {sentiment}")

prob negativa: 0.57460
prob positiva: 0.42540
This movie is more or less very hungry
Predicted sentiment: neutral


### Listar tensores na GPU e CPU


In [None]:
import torch
import gc


# Função para listar tensores na GPU
def get_tensors_in_gpu():
    print("\nTensores na GPU:")
    for obj in gc.get_objects():
        if torch.is_tensor(obj):
            if obj.is_cuda:
                print(type(obj), obj.size(), obj.device)
# Listar tensores na GPU
get_tensors_in_gpu()
torch.cuda.empty_cache()



Tensores na GPU:
<class 'torch.nn.parameter.Parameter'> torch.Size([10, 768]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([10]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([2, 10]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([2]) cuda:0
<class 'torch.Tensor'> torch.Size([30522, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([30522, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([512, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([512, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([2, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([2, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([768]) cuda:0
<class 'torch.Tensor'> torch.Size([768]) cuda:0
<class 'torch.Tensor'> torch.Size([768]) cuda:0
<class 'torch.Tensor'> torch.Size([768]) cuda:0
<class 'torch.Tensor'> torch.Size([768, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([768, 768]) cuda:0
<class 'torch.Tensor'> torch.Size([768]) cuda:0
<class 'torch.Tensor'> torch.Size([768]) cuda:0
<class 'tor



In [None]:
# # Função para listar tensores na CPU
# def get_tensors_in_cpu():
#     print("\nTensores na CPU:")
#     for obj in gc.get_objects():
#         if torch.is_tensor(obj):
#             if not obj.is_cuda:
#                 print(type(obj), obj.size(), obj.device)

# # Listar tensores na CPU
# get_tensors_in_cpu()