In [None]:
pip install transformers torch datasets

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): none of the visible cells read from or write to /content/drive
# (the model is saved to "./model", logs go to "./logs") — confirm the mount is
# still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Static settings: pretrained checkpoint, output location and schedule.
# (Alternative values kept from the original cell: batch_size = 1,
# num_train_epochs = 1.)
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
output_dir = "./model"
batch_size, num_train_epochs = 8, 30

# Optimized hyperparameters.
early_stopping_patience = 2
label_smoothing_factor = 0.07158711257743958
adam_epsilon = 2.527092625455385e-08
adam_beta2 = 0.8338816842140165
adam_beta1 = 0.8445637934160373
weight_decay = 0.031031065174245122
learning_rate = 3.255788747459486e-05


# Tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the dataset and pick the splits used for training and evaluation.
# NOTE(review): the "test" split is used both for the per-epoch evaluation that
# selects the best model and for the final report — confirm this is intended.
dataset = load_dataset("dougtrajano/olid-br")
train_dataset, eval_dataset = dataset["train"], dataset["test"]

def convert_labels(example):
    """Add an integer ``label`` field derived from ``is_offensive``.

    Args:
        example: A single dataset row (dict-like) containing ``is_offensive``.

    Returns:
        The same ``example`` object, with ``example['label']`` set to 1
        (offensive) or 0 (not offensive).

    String values are matched against explicit "offensive" markers rather than
    tested for truthiness: any non-empty string (including ``"NOT"``) is truthy
    and would otherwise always be mapped to 1. Non-string values keep the
    original truthiness rule.
    """
    raw = example['is_offensive']
    if isinstance(raw, str):
        # NOTE(review): the classification report at the end of the notebook
        # names the classes "NOT"/"OFF", which suggests this column holds those
        # strings — confirm against the dataset card.
        offensive = raw.strip().upper() in ("OFF", "TRUE", "1")
    else:
        offensive = bool(raw)
    example['label'] = 1 if offensive else 0
    return example

# Attach the integer 'label' column to both splits.
train_dataset = train_dataset.map(convert_labels)
eval_dataset = eval_dataset.map(convert_labels)

# Longest sequence (in tokens) kept after truncation.
# NOTE(review): 512 is the usual position-embedding limit of BERT-base
# checkpoints — confirm for this model. Passing it explicitly keeps truncation
# effective even if the tokenizer does not define a maximum length itself.
max_seq_length = 512


def tokenize_batch(batch):
    """Tokenize a batch of rows, truncating long texts and leaving them unpadded.

    Args:
        batch: Mapping with a ``'text'`` list, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for ``batch['text']``, truncated to
        ``max_seq_length`` tokens.
    """
    return tokenizer(batch['text'], truncation=True, max_length=max_seq_length)


# Tokenize both splits with the same function. Sequences are not padded here:
# padding every example to the maximum length makes every batch as long as the
# longest allowed sequence. The Trainer below receives the tokenizer, so its
# default collator pads each batch to that batch's longest sequence instead.
train_dataset = train_dataset.map(tokenize_batch, batched=True)
eval_dataset = eval_dataset.map(tokenize_batch, batched=True)

# Per-class loss weights: each class is weighted by the share of training
# examples that belong to the other class, so the rarer class weighs more.
_train_labels = list(train_dataset["label"])
class_weights = [1 - (_train_labels.count(class_id) / len(_train_labels)) for class_id in (0, 1)]


class BertForSequenceClassificationWithWeightedLoss(BertForSequenceClassification):
    """BERT sequence classifier trained with class-weighted, label-smoothed cross-entropy.

    The parent ``forward`` builds its own unweighted loss, so storing a loss
    module on the instance has no effect on training. This subclass overrides
    ``forward`` and computes the loss itself from the parent's logits.
    """

    def __init__(self, config):
        super().__init__(config)
        # Kept as plain Python values (not tensors or modules) so nothing extra
        # is added to the state dict; the weight tensor is created in
        # ``forward`` on the device and dtype of the logits.
        self.loss_class_weights = list(class_weights)
        self.loss_label_smoothing = label_smoothing_factor

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the classifier and, when ``labels`` is given, add the weighted loss.

        The parameters are listed explicitly (no ``**kwargs``) because the
        Trainer inspects this signature to decide which dataset columns to keep
        and which extra arguments to pass.

        Returns:
            The parent's output when ``labels`` is ``None``. Otherwise the same
            output with ``loss`` filled in (or ``(loss, *outputs)`` when the
            parent returned a plain tuple).
        """
        # Forward optional arguments only when they were actually supplied.
        optional = {
            "position_ids": position_ids,
            "head_mask": head_mask,
            "inputs_embeds": inputs_embeds,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "return_dict": return_dict,
        }
        optional = {name: value for name, value in optional.items() if value is not None}

        # Labels are deliberately not passed on: the parent would compute its
        # own unweighted loss from them.
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **optional,
        )
        if labels is None:
            return outputs

        returns_tuple = isinstance(outputs, tuple)
        logits = outputs[0] if returns_tuple else outputs.logits
        weight = torch.tensor(self.loss_class_weights, dtype=logits.dtype, device=logits.device)
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, self.num_labels),
            labels.view(-1),
            weight=weight,
            label_smoothing=self.loss_label_smoothing,
        )

        if returns_tuple:
            return (loss,) + outputs
        # Rebuild the output so that `loss` comes first, as in the parent class.
        return type(outputs)(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


model = BertForSequenceClassificationWithWeightedLoss.from_pretrained(model_checkpoint, num_labels=2)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    # Reload the checkpoint with the best evaluation accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none",  # Do not report to any experiment-tracking integration
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_beta1=adam_beta1,
    adam_beta2=adam_beta2,
    adam_epsilon=adam_epsilon,
    # `label_smoothing_factor` is intentionally not passed here: when it is
    # non-zero the Trainer computes the loss itself and ignores the loss
    # returned by the model, which would bypass the class weights. Smoothing is
    # applied inside the model's loss instead.
)

def compute_metrics(eval_pred):
    """Compute classification accuracy from logits and reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along the last axis; if it is a tuple, its first element is
            used. ``labels`` holds the integer class ids. Both may be NumPy
            arrays, tensors or nested lists.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of rows whose
        highest-scoring class equals the label, or ``0.0`` when there are no
        labels.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Presumably a model returning extra outputs; the class scores are
        # assumed to come first — verify if such a model is ever used.
        logits = logits[0]

    # Convert both sides to tensors so the comparison does not rely on implicit
    # tensor/array conversion.
    targets = torch.as_tensor(labels)
    total = targets.numel()
    if total == 0:
        # Avoid dividing by zero on an empty evaluation set.
        return {"accuracy": 0.0}

    predictions = torch.as_tensor(logits).argmax(dim=-1)
    correct = (predictions == targets).sum().item()
    return {"accuracy": correct / total}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has not improved for
    # `early_stopping_patience` consecutive evaluations. Without this callback
    # the patience setting is never applied and every epoch always runs.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
)

# Start training.
trainer.train()

# Save the model.
trainer.save_model(output_dir)

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Run the trained model over the evaluation split.
eval_output = trainer.predict(eval_dataset)

# Reference labels, and predicted class = index of the largest logit.
true_labels = eval_output.label_ids
predictions = torch.tensor(eval_output.predictions).argmax(dim=-1).numpy()

# Overall accuracy plus weighted-average precision, recall and F1.
weighted_avg = {"average": "weighted"}
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, **weighted_avg)
recall = recall_score(true_labels, predictions, **weighted_avg)
f1 = f1_score(true_labels, predictions, **weighted_avg)

print(f"Acurácia: {accuracy * 100:.2f}%")
print(f"Precisão (ponderada): {precision * 100:.2f}%")
print(f"Abrangência (ponderada): {recall * 100:.2f}%")
print(f"F-Measure (ponderada): {f1 * 100:.2f}%")

# Per-class classification report.
class_names = ["NOT (non-offensive)", "OFF (offensive)"]
class_report = classification_report(
    true_labels,
    predictions,
    target_names=class_names,
    digits=2,
)
print("\nRelatório de classificação:")
print(class_report)