<a href="https://colab.research.google.com/github/codeonthespectrum/Curupira.ia/blob/main/curupira.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install -q transformers datasets accelerate sentencepiece huggingface_hub nlpaug

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m327.7/480.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━

In [12]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.nn import BCEWithLogitsLoss
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
import nlpaug.augmenter.word as naw
import nltk
import random
from nltk.corpus import wordnet

In [63]:
ds = load_dataset("ruanchaves/hatebr")

In [64]:
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [120]:
def augment_text(text, num_augmentations=2):
    words = text.split()
    new_texts = []

    for _ in range(num_augmentations):
        new_words = words[:]
        for i in range(len(words)):
            synonyms = wordnet.synsets(words[i], lang="por")
            if synonyms:
                lemma_names = [lemma.name() for lemma in synonyms[0].lemmas() if lemma.name() is not None]
                if lemma_names:
                    chosen_synonym = random.choice(lemma_names)
                    new_words[i] = chosen_synonym
        new_texts.append(" ".join(new_words))

    return new_texts

In [121]:
def augment_dataset(examples):
    augmented_examples = {"instagram_comments": [], "offensive_language": []}

    for text, label in zip(examples["instagram_comments"], examples["offensive_language"]):
        new_samples = augment_text(text)
        augmented_examples["instagram_comments"].extend(new_samples)
        augmented_examples["offensive_language"].extend([label] * len(new_samples))


    augmented_examples["instagram_comments"].extend(examples["instagram_comments"])
    augmented_examples["offensive_language"].extend(examples["offensive_language"])

    return augmented_examples

In [122]:
ds_split = ds['train'].train_test_split(test_size=0.2)

In [123]:
augmented_train = ds_split['train'].map(augment_dataset, batched=True, remove_columns=ds_split['train'].column_names)
ds_split['train'] = concatenate_datasets([ds_split['train'], augmented_train])

Map:   0%|          | 0/3584 [00:00<?, ? examples/s]

In [124]:
model_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.config.hidden_dropout_prob = 0.3
model.config.attention_probs_dropout_prob = 0.3
model.config.problem_type = "binary_classification"

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [139]:
def tokenize_function(examples):
    return tokenizer(
        examples["instagram_comments"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# Tokenizar os dados
tokens_ds = ds_split.map(tokenize_function, batched=True)
tokens_ds = tokens_ds.rename_column("offensive_language", "labels")
tokens_ds = tokens_ds.map(lambda x: {"labels": float(x["labels"][0]) if isinstance(x["labels"], list) else float(x["labels"])})

Map:   0%|          | 0/14336 [00:00<?, ? examples/s]

In [135]:
columns_to_remove = [col for col in tokens_ds['train'].column_names if col not in ['input_ids', 'attention_mask', 'labels']]
tokens_ds = tokens_ds.remove_columns(columns_to_remove)

In [127]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        if labels is not None:
            if isinstance(labels, torch.Tensor):
                labels = labels.detach().view(-1)
            else:
                labels = torch.tensor(labels).view(-1)

        outputs = model(**inputs)
        logits = outputs.logits.view(-1)


        class_weights = torch.tensor([1.0, 3.0], device=logits.device)
        loss_fct = BCEWithLogitsLoss(pos_weight=class_weights)
        loss = loss_fct(logits, labels.float())

        return (loss, outputs) if return_outputs else loss


In [136]:
def compute_metrics(p):
    preds = torch.sigmoid(torch.tensor(p.predictions)).numpy()
    preds = (preds > 0.5).astype(int)
    labels = p.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
    }

In [137]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    label_names=["labels"],
    logging_steps=100,
    learning_rate=3e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    remove_unused_columns=False,

)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test'],
    compute_metrics=compute_metrics,


)

In [140]:
trainer.train()

RuntimeError: Could not infer dtype of NoneType

In [141]:
from sklearn.metrics import confusion_matrix

# Get predictions using the trainer object
predictions = trainer.predict(ds_split['test'])
# Access predicted labels and ground truth labels
y_pred = predictions.predictions.argmax(-1)
y_test = predictions.label_ids

# Compute and print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


TypeError: BertForSequenceClassification.forward() got an unexpected keyword argument 'offensive_language'