# Fine-Tuning BERT for Sentiment Analysis

This script fine-tunes a pre-trained BERT model for sentiment analysis using the HuggingFace Transformers library.

### Load Essential Libraries

In [None]:
# Pytorch 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

### Verifying GPU

In [None]:
# Show the installed PyTorch version
print(torch.__version__)

In [None]:
# Report which compute device is available for this session.
if not torch.cuda.is_available():
    print("GPU não disponível. Usando CPU.")
else:
    print(f"GPU disponível: {torch.cuda.get_device_name(0)}")
    # Clear the GPU cache
    torch.cuda.empty_cache()

In [None]:
# Sklearn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Loading the BERT model and tokenizer classes from Hugging Face Transformers

from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    get_linear_schedule_with_warmup
)

# Data loading library
from datasets import load_dataset

In [None]:
# Data handling and visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

In [None]:
# Data manipulation libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os
from datetime import datetime

### Track Training and Test with TensorBoard

In [None]:
from torch.utils.tensorboard import SummaryWriter

# Directory that receives the TensorBoard event files for this notebook
TENSORBOARD_LOG_DIR = "../logs/runs/sentiment_analysis"

# Module-level writer used by the training loop to log per-epoch scalars
writer = SummaryWriter(log_dir=TENSORBOARD_LOG_DIR)

In [None]:
# IPython magics: load the TensorBoard notebook extension, then open the
# dashboard on the same log directory the SummaryWriter writes to.
%load_ext tensorboard
%tensorboard --logdir="../logs/runs/sentiment_analysis"

## Download Dataset

In [None]:
# Load the dataset used for training and evaluation.
# It is a collection of B2W product reviews, used here for sentiment analysis.
# Note: the identifier must be resolvable by the Hugging Face `datasets` library.
DATASET_HF = "ruanchaves/b2w-reviews01"

dataset = load_dataset(DATASET_HF)

In [None]:
# Display the loaded dataset object to inspect its structure and contents
# (the bare expression is rendered as the cell output)

dataset

## Classes and Functions

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of texts. Items are converted with ``str``;
            missing values (``None`` or NaN) are treated as an empty string.
        labels: Indexable collection of integer class labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every item is padded or truncated to by the tokenizer.
        lowercase: When True (default, the original behaviour) the text is
            lower-cased before tokenization.
            NOTE(review): the default checkpoint used in this file is a *cased*
            model; consider ``lowercase=False`` for it — confirm with an experiment.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, lowercase=True):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.lowercase = lowercase

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    @staticmethod
    def _as_text(value):
        """Convert a raw item to ``str``, mapping missing values to an empty string."""
        # ``value != value`` is only true for NaN, the usual pandas marker for
        # a missing entry; without this guard a missing text would be tokenized
        # as the literal word "None"/"nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Return the encoded item at ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            (flattened tokenizer output) and a scalar ``torch.long`` ``'labels'``.
        """
        text = self._as_text(self.texts[idx])
        label = self.labels[idx]

        # Preprocessing: optional conversion to lowercase
        if self.lowercase:
            text = text.lower()

        # Tokenization with padding, truncation and attention mask
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
class BERTSentimentTrainer:
    """Fine-tune, evaluate and persist a BERT sequence classifier for sentiment analysis.

    The trainer owns the tokenizer and the model, keeps the per-epoch loss and
    accuracy history used for plotting, and can save/load the model together
    with a ``metadata.json`` file describing the run.
    """

    def __init__(self, model_name='neuralmind/bert-base-portuguese-cased', max_length=128, num_labels=2
                 , dataset_name="ruanchaves/b2w-reviews01"):
        """Load the tokenizer and the pre-trained model and move the model to the device.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            max_length: Sequence length handed to the datasets built by
                ``prepare_data``.
            num_labels: Number of output classes of the classification head.
            dataset_name: Hugging Face dataset identifier used by ``load_data``.
        """
        self.model_name = model_name
        self.max_length = max_length
        self.num_labels = num_labels
        self.dataset_name = dataset_name
        # Use the GPU when CUDA is available, otherwise fall back to the CPU
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize tokenizer and model, loading the pre-trained weights
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        ).to(self.device)

        # Per-epoch training history (one entry appended per epoch by ``train``)
        self.train_losses = []
        self.val_losses = []
        self.train_accuracies = []
        self.val_accuracies = []

        print(f"Modelo carregado: {model_name}")
        print(f"Dispositivo: {self.device}")
        print(f"Dataset configurado: {self.dataset_name}")

    def load_data(self):
        """Load the configured dataset and derive binary sentiment labels.

        Returns:
            tuple ``(texts, binary_labels)``: the ``review_text`` column as a
            list and a list of 0/1 labels derived from ``overall_rating``.
        """
        print(f"Carregando dataset: {self.dataset_name}...")

        # Load the dataset from Hugging Face
        dataset = load_dataset(self.dataset_name)

        # Convert to a DataFrame (the dataset only has a 'train' split)
        all_data = pd.DataFrame(dataset['train'])

        # Text comes from 'review_text'; the target is derived from 'overall_rating'
        texts = all_data['review_text'].tolist()
        labels = all_data['overall_rating'].tolist()

        # Convert ratings to binary classes (1-2: negative=0, 3-5: positive=1)
        binary_labels = [1 if rating >= 3 else 0 for rating in labels]

        print(f"Total de amostras: {len(texts)}")
        print(f"Distribuição de classes: {pd.Series(binary_labels).value_counts()}")

        return texts, binary_labels

    def prepare_data(self, texts, labels, test_size=0.2, val_size=0.1,
                     seed_random_state=None):
        """Split the data into stratified train/validation/test datasets.

        Args:
            texts: Texts to split.
            labels: Labels aligned with ``texts``; also used for stratification.
            test_size: Fraction of the whole data held out for testing.
            val_size: Fraction of the whole data held out for validation.
            seed_random_state: Seed forwarded to both splits for reproducibility.

        Returns:
            tuple ``(train_dataset, val_dataset, test_dataset)`` of
            ``SentimentDataset`` instances.
        """
        # First, split off the test set
        X_temp, X_test, y_temp, y_test = train_test_split(
            texts, labels, test_size=test_size, random_state=seed_random_state, stratify=labels)

        # ``val_size`` is relative to the whole data, so rescale it to the
        # share of what remains after removing the test set
        val_relative = val_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp, test_size=val_relative, random_state=seed_random_state, stratify=y_temp)

        # Build the datasets
        train_dataset = SentimentDataset(X_train, y_train, self.tokenizer, self.max_length)
        val_dataset = SentimentDataset(X_val, y_val, self.tokenizer, self.max_length)
        test_dataset = SentimentDataset(X_test, y_test, self.tokenizer, self.max_length)

        return train_dataset, val_dataset, test_dataset

    def train(self, train_dataset, val_dataset, batch_size=16, epochs=3, learning_rate=2e-5,
              tb_writer=None):
        """Fine-tune the model and record per-epoch metrics.

        Args:
            train_dataset: Dataset iterated (shuffled) for optimisation.
            val_dataset: Dataset evaluated at the end of every epoch.
            batch_size: Batch size of both data loaders.
            epochs: Number of passes over ``train_dataset``.
            learning_rate: AdamW learning rate.
            tb_writer: Optional object with ``add_scalar(tag, value, step)``
                used for TensorBoard logging. When omitted, a module-level
                ``writer`` is used if one exists; otherwise logging is skipped.

        Raises:
            ValueError: If ``train_dataset`` yields no batches.
        """
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Fail before any work instead of dividing by zero after the loop
        if len(train_loader) == 0:
            raise ValueError("train_dataset is empty; nothing to train on")

        # Resolve the TensorBoard writer up front: looking up a missing global
        # inside the loop would raise NameError only after a full epoch.
        if tb_writer is None:
            tb_writer = globals().get('writer')

        # Optimizer and scheduler (linear decay over all steps, no warmup)
        optimizer = AdamW(self.model.parameters(), lr=learning_rate, weight_decay=0.01)
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        print(f"Iniciando treinamento por {epochs} épocas...")

        for epoch in range(epochs):
            print(f"\nÉpoca {epoch + 1}/{epochs}")

            # Training
            self.model.train()
            total_train_loss = 0
            train_predictions = []
            train_true_labels = []

            train_pbar = tqdm(train_loader, desc="Treinamento")
            for batch in train_pbar:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                # Passing ``labels`` makes the model return the loss as well
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_train_loss += loss.item()

                # Backward pass, with gradient norm clipped to 1.0
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                # Metrics
                predictions = torch.argmax(outputs.logits, dim=-1)
                train_predictions.extend(predictions.cpu().numpy())
                train_true_labels.extend(labels.cpu().numpy())

                train_pbar.set_postfix({'loss': loss.item()})

            avg_train_loss = total_train_loss / len(train_loader)
            train_accuracy = accuracy_score(train_true_labels, train_predictions)

            # Validation (the detailed metrics dict is not needed here)
            val_loss, val_accuracy, _ = self.evaluate(val_loader)

            # Store metrics
            self.train_losses.append(avg_train_loss)
            self.val_losses.append(val_loss)
            self.train_accuracies.append(train_accuracy)
            self.val_accuracies.append(val_accuracy)

            # Log metrics to TensorBoard when a writer is available
            if tb_writer is not None:
                tb_writer.add_scalar("Loss/Treino", avg_train_loss, epoch + 1)
                tb_writer.add_scalar("Loss/Validação", val_loss, epoch + 1)
                tb_writer.add_scalar("Acurácia/Treino", train_accuracy, epoch + 1)
                tb_writer.add_scalar("Acurácia/Validação", val_accuracy, epoch + 1)

            print(f"Loss de treino: {avg_train_loss:.4f}")
            print(f"Acurácia de treino: {train_accuracy:.4f}")
            print(f"Loss de validação: {val_loss:.4f}")
            print(f"Acurácia de validação: {val_accuracy:.4f}")

    def evaluate(self, data_loader):
        """Evaluate the model on ``data_loader`` without updating weights.

        Args:
            data_loader: Sized iterable of batches with ``'input_ids'``,
                ``'attention_mask'`` and ``'labels'`` tensors.

        Returns:
            tuple ``(avg_loss, accuracy, metrics)`` where ``metrics`` is a dict
            with keys ``loss``, ``accuracy``, ``precision``, ``recall``, ``f1``
            (weighted averages) and ``auc_roc``.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        # Fail before any work instead of dividing by zero after the loop
        if len(data_loader) == 0:
            raise ValueError("data_loader is empty; nothing to evaluate")

        self.model.eval()
        total_loss = 0
        predictions = []
        true_labels = []
        class_probabilities = []

        with torch.no_grad():
            for batch in tqdm(data_loader, desc="Avaliação"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_loss += outputs.loss.item()

                logits = outputs.logits
                batch_predictions = torch.argmax(logits, dim=-1)

                predictions.extend(batch_predictions.cpu().numpy())
                true_labels.extend(labels.cpu().numpy())
                class_probabilities.extend(torch.softmax(logits, dim=-1).cpu().numpy())

        avg_loss = total_loss / len(data_loader)
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predictions, average='weighted'
        )

        # AUC-ROC from the probability of class index 1 (the positive class);
        # this indexing assumes a binary problem
        probabilities = np.array(class_probabilities)[:, 1]
        auc_roc = roc_auc_score(true_labels, probabilities)

        metrics = {
            'loss': avg_loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc_roc': auc_roc
        }

        return avg_loss, accuracy, metrics

    def save_model(self, save_path, metrics, hyperparameters):
        """Save the model, the tokenizer and a ``metadata.json`` file to ``save_path``.

        Args:
            save_path: Target directory; created when missing.
            metrics: JSON-serialisable metrics stored under ``'metrics'``.
            hyperparameters: JSON-serialisable values stored under
                ``'hyperparameters'``.
        """
        os.makedirs(save_path, exist_ok=True)

        # Save model and tokenizer
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

        # Save metrics and hyperparameters
        metadata = {
            'timestamp': datetime.now().isoformat(),
            'model_name': self.model_name,
            'max_length': self.max_length,
            'hyperparameters': hyperparameters,
            'metrics': metrics,
            'device': str(self.device)
        }

        # Explicit encoding keeps the file independent of the platform default
        with open(os.path.join(save_path, 'metadata.json'), 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)

        print(f"Modelo salvo em: {save_path}")

    def load_model(self, load_path):
        """Replace the current model and tokenizer with the ones saved in ``load_path``.

        Returns:
            The dict parsed from ``metadata.json`` in ``load_path``.
        """
        self.model = BertForSequenceClassification.from_pretrained(load_path).to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained(load_path)

        with open(os.path.join(load_path, 'metadata.json'), 'r', encoding='utf-8') as f:
            metadata = json.load(f)

        return metadata

    def plot_training_history(self, save_path=None):
        """Plot the recorded loss and accuracy curves for training and validation.

        Args:
            save_path: Optional existing directory; when given, the figure is
                also written there as ``training_history.png``.
        """
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Loss
        axes[0].plot(self.train_losses, label='Treino')
        axes[0].plot(self.val_losses, label='Validação')
        axes[0].set_title('Loss durante o Treinamento')
        axes[0].set_xlabel('Época')
        axes[0].set_ylabel('Loss')
        axes[0].legend()

        # Accuracy
        axes[1].plot(self.train_accuracies, label='Treino')
        axes[1].plot(self.val_accuracies, label='Validação')
        axes[1].set_title('Acurácia durante o Treinamento')
        axes[1].set_xlabel('Época')
        axes[1].set_ylabel('Acurácia')
        axes[1].legend()

        plt.tight_layout()

        # Save before ``show`` so the written file contains the drawn figure
        if save_path:
            plt.savefig(os.path.join(save_path, 'training_history.png'))
        plt.show()

In [None]:
class ModelVersionManager:
    """Organise saved models into ``experiments`` and ``production`` directories.

    Every experiment directory is expected to contain a ``metadata.json`` file
    with at least ``'timestamp'`` and ``'metrics'`` (including ``'f1'``).
    """

    def __init__(self, base_path='../models'):
        """Create ``<base_path>/experiments`` and ``<base_path>/production`` if missing."""
        self.base_path = base_path
        self.experiments_path = os.path.join(base_path, 'experiments')
        self.production_path = os.path.join(base_path, 'production')

        os.makedirs(self.experiments_path, exist_ok=True)
        os.makedirs(self.production_path, exist_ok=True)

    @staticmethod
    def _read_metadata(directory):
        """Return the dict parsed from ``metadata.json`` inside ``directory``."""
        with open(os.path.join(directory, 'metadata.json'), 'r', encoding='utf-8') as f:
            return json.load(f)

    def save_experiment(self, trainer, metrics, hyperparameters, experiment_name=None):
        """Save the trainer's model and training plot as a new experiment.

        Args:
            trainer: Object providing ``save_model(path, metrics, hyperparameters)``
                and ``plot_training_history(path)``.
            metrics: Metrics forwarded to ``trainer.save_model``.
            hyperparameters: Hyperparameters forwarded to ``trainer.save_model``.
            experiment_name: Directory name; defaults to
                ``experiment_<YYYYmmdd_HHMMSS>`` built from the current time.

        Returns:
            Path of the experiment directory.
        """
        if experiment_name is None:
            experiment_name = f"experiment_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        experiment_path = os.path.join(self.experiments_path, experiment_name)
        # ``save_model`` must run first: it creates the directory the plot is saved into
        trainer.save_model(experiment_path, metrics, hyperparameters)
        trainer.plot_training_history(experiment_path)

        return experiment_path

    def promote_to_production(self, experiment_path, model_name='best_model'):
        """Copy an experiment to production when its F1 beats the current model.

        The experiment is promoted when no production model exists yet or when
        its F1-score is strictly greater than the production model's.

        Args:
            experiment_path: Directory of the experiment to promote.
            model_name: Name of the production directory to (re)place.

        Returns:
            True when the experiment was promoted, False otherwise.
        """
        import shutil

        production_model_path = os.path.join(self.production_path, model_name)

        # Load the experiment's F1 up front so a malformed metadata file is
        # detected before anything in production is touched
        new_f1 = self._read_metadata(experiment_path)['metrics']['f1']

        # Compare against the current production model, if there is one
        if os.path.exists(os.path.join(production_model_path, 'metadata.json')):
            current_f1 = self._read_metadata(production_model_path)['metrics']['f1']

            if new_f1 <= current_f1:
                print(f"Modelo atual (F1: {current_f1:.4f}) é melhor que o novo (F1: {new_f1:.4f})")
                return False

        # Copy into a staging directory first and only then swap it in, so a
        # copy that fails half-way never leaves production without a model
        staging_path = production_model_path + '.staging'
        if os.path.exists(staging_path):
            shutil.rmtree(staging_path)
        shutil.copytree(experiment_path, staging_path)

        if os.path.exists(production_model_path):
            shutil.rmtree(production_model_path)
        os.replace(staging_path, production_model_path)

        print(f"Modelo promovido para produção! F1-Score: {new_f1:.4f}")
        return True

    def list_experiments(self):
        """List every experiment that has a ``metadata.json`` file.

        Returns:
            List of dicts with ``'name'``, ``'timestamp'`` and ``'metrics'``,
            sorted by experiment name so the order is deterministic.
        """
        experiments = []
        # ``os.listdir`` returns entries in arbitrary order; sort for stable output
        for exp_name in sorted(os.listdir(self.experiments_path)):
            exp_path = os.path.join(self.experiments_path, exp_name)

            # Entries without metadata are not experiments; skip them
            if not os.path.exists(os.path.join(exp_path, 'metadata.json')):
                continue

            metadata = self._read_metadata(exp_path)
            experiments.append({
                'name': exp_name,
                'timestamp': metadata['timestamp'],
                'metrics': metadata['metrics']
            })

        return experiments

## Pipeline of Fine-tuning BERT for Sentiment Analysis

### Exploratory Data Analysis (EDA)

#### Data Catalog

**Data Fields**


| Column	                | Description                                                                           |
|---------------------------|---------------------------------------------------------------------------------------|
| submission_date           | The date and time when the review was submitted. Format: "%Y-%m-%d %H:%M:%S".         |
| reviewer_id               | A unique identifier for the reviewer.                                                 |
| product_id	            | A unique identifier for the product being reviewed.                                   |
| product_name	            | The name of the product being reviewed.                                               |
| product_brand	            | The brand of the product being reviewed.                                              |
| site_category_lv1         | The highest level category for the product on the site where the review is submitted. |
| site_category_lv2	        | The second level category for the product on the site where the review is submitted.  |
| review_title	            | The title of the review.                                                              |
| **overall_rating**        | The overall star rating given by the reviewer on a scale of 1 to 5.                   |
| **recommend_to_a_friend**	| Whether or not the reviewer would recommend the product to a friend (Yes/No).         |
| **review_text**           | The full text of the review.                                                          |
| reviewer_birth_year	    | The birth year of the reviewer.                                                       |
| reviewer_gender	        | The gender of the reviewer (F/M).                                                     |
| reviewer_state	        | The Brazilian state of the reviewer (e.g., RJ).                                       |

In [None]:
# Convert the 'train' split of the dataset into a pandas DataFrame for the analysis below
dataset_df = pd.DataFrame(dataset['train'])

#### **Target:** Overall_rating vs Recommend_to_a_friend

In [None]:
# Cross-tabulate 'overall_rating' against 'recommend_to_a_friend'; each rating
# row is normalised so its cells add up to 100%.
ratings = dataset_df['overall_rating']
recommendations = dataset_df['recommend_to_a_friend']
cross_tab = pd.crosstab(ratings, recommendations, normalize='index') * 100

# Draw the table as an annotated heatmap with a percentage colour bar
heatmap_options = dict(annot=True, fmt=".2f", cmap="Blues", cbar_kws={'format': '%.0f%%'})
plt.figure(figsize=(8, 6))
sns.heatmap(cross_tab, **heatmap_options)
plt.title('Matriz de Cruzamento: Nota vs. Recomendaria a um Amigo (%)')
plt.xlabel('Recomendaria a um Amigo')
plt.ylabel('Nota (overall_rating)')
plt.show()

In [None]:
# Display the row-normalised percentage table as the cell output
cross_tab

In [None]:
# Ratings given by reviewers who would not / would recommend the product
recommend_answer = dataset_df['recommend_to_a_friend']
x_no = dataset_df.loc[recommend_answer == 'No', 'overall_rating']
x_yes = dataset_df.loc[recommend_answer == 'Yes', 'overall_rating']

# One histogram trace per answer
fig = go.Figure()
histogram_specs = (
    (x_no, 'Recomendaria a um Amigo: Não', 'red'),
    (x_yes, 'Recomendaria a um Amigo: Sim', 'blue'),
)
for trace_values, trace_name, trace_color in histogram_specs:
    fig.add_trace(go.Histogram(x=trace_values, name=trace_name, marker_color=trace_color))

# 'stack' piles the bars of the two traces on top of each other per rating
fig.update_layout(barmode='stack')
fig.show()

In [None]:
# Share (in %) of each answer in the 'recommend_to_a_friend' column
recommend_shares = dataset_df['recommend_to_a_friend'].value_counts(normalize=True)
percentual = recommend_shares * 100
print(percentual)

In [None]:
# Group overall_rating into positive (3 to 5) and negative (1 to 2)
is_positive = dataset_df['overall_rating'] >= 3
dataset_df['sentiment'] = is_positive.map({True: 'positivo', False: 'negativo'})

# Share (in %) of positive and negative reviews
sentiment_shares = dataset_df['sentiment'].value_counts(normalize=True)
percentual_sentiment = sentiment_shares * 100

print(percentual_sentiment)

#### Discovering MAX_LENGTH

In [None]:
# Strip surrounding whitespace, then measure each review in characters
# (missing reviews count as length 0)
stripped_reviews = dataset_df['review_text'].str.strip()
dataset_df['review_text'] = stripped_reviews
dataset_df['review_text_length'] = stripped_reviews.str.len().fillna(0).astype('int64')

# Distribution of review lengths in bins of 50 characters, with a KDE overlay
review_lengths = dataset_df['review_text_length']
plt.figure(figsize=(19, 8))
sns.histplot(review_lengths, binwidth=50, kde=True, color='blue')
plt.title('Distribuição do Número de Caracteres por Review (Agrupamento de 50)')
plt.xlabel('Número de Caracteres')
plt.ylabel('Frequência')
plt.show()

In [None]:
# Distribution of review lengths in bins of 500 characters, with the
# frequency expressed as a percentage of all reviews
review_lengths = dataset_df['review_text_length']

# Number of reviews; not used by the plot (stat='percent' normalises by itself)
total_count = len(review_lengths)

plt.figure(figsize=(19, 8))
sns.histplot(review_lengths, binwidth=500, kde=False, color='blue', stat='percent')

plt.title('Distribuição do Número de Caracteres por Review (Agrupamento de 500)')
plt.xlabel('Número de Caracteres')
plt.ylabel('Frequência (%)')

# Y-axis ticks every 5%, from 0 to 100
y_ticks = np.arange(0, 101, 5)
plt.yticks(y_ticks)

plt.show()

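**Constatamos:** ao observar o gráfico de distribuição, aproximadamente 95% dos registros têm até 500 caracteres; os que ultrapassam esse tamanho estão nos 5% restantes. Desta forma, adotamos MAX_LENGTH = 512 para preservar todo o contexto de ~95% dos registros; os demais serão truncados. Observação: MAX_LENGTH é medido em tokens, não em caracteres; como cada token cobre pelo menos um caractere, 512 tokens comportam com folga textos de até 500 caracteres (um valor menor provavelmente bastaria e reduziria o custo de memória e de treino).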

### Data Preprocessing

In [None]:
# Parameters used to instantiate the BERT model wrapper
MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'
NUM_LABELS = 2
# NOTE(review): this is passed to the tokenizer as max_length, which counts
# tokens rather than characters — confirm 512 is the intended value
MAX_LENGTH = 512

# Training parameters
BATCH_SIZE = 16  # Set to 16 to avoid memory problems; the GPU has only 8192 MiB
EPOCHS = 1
LEARNING_RATE = 2e-5

# Hyperparameters stored alongside each saved experiment
hyperparameters = dict(
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
)

In [None]:
# Create the trainer, which loads the BERT tokenizer and model
trainer = BERTSentimentTrainer(
    model_name=MODEL_NAME,
    max_length=MAX_LENGTH,
    num_labels=NUM_LABELS,
    dataset_name=DATASET_HF,
)

# Create the model version manager
version_manager = ModelVersionManager()

In [None]:
# Load the raw data and build the train/validation/test datasets (fixed seed)
texts, labels = trainer.load_data()
train_dataset, val_dataset, test_dataset = trainer.prepare_data(texts, labels, seed_random_state=42)

# Report the size of every split
print("Dados preparados:")
split_report = (
    ("Treino", train_dataset),
    ("Validação", val_dataset),
    ("Teste", test_dataset),
)
for split_label, split_dataset in split_report:
    print(f"  {split_label}: {len(split_dataset)} amostras")

### Model Training

In [None]:
# Train the model with the configured training parameters
trainer.train(
    train_dataset,
    val_dataset,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
)

### Model Evaluation

In [None]:
# Evaluate on the held-out test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loss, test_accuracy, test_metrics = trainer.evaluate(test_loader)

# Print the final report, one metric per line with four decimals
print("\n=== Resultados Finais ===")
report_rows = (
    ("Loss de teste", 'loss'),
    ("Acurácia", 'accuracy'),
    ("Precisão", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1'),
    ("AUC-ROC", 'auc_roc'),
)
for caption, metric_key in report_rows:
    print(f"{caption}: {test_metrics[metric_key]:.4f}")

In [None]:
# Save the experiment (model, tokenizer, metadata and training plot)
experiment_path = version_manager.save_experiment(trainer, test_metrics, hyperparameters)
    
# Try to promote it to production; the boolean result is not used here
version_manager.promote_to_production(experiment_path)
    
print(f"\nExperimento salvo em: {experiment_path}")

In [None]:
# Close the TensorBoard writer created at the top of the notebook
writer.close()