In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

import spacy
import re
import numpy as np
import matplotlib.pyplot as plt
import emoji

from unicodedata import normalize
from tqdm import tqdm
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from datasets import list_datasets, load_dataset

In [61]:
def preprocessar_tweets(tweets):
    repetion_pattern = re.compile(r'(.)\1\1+')
    new_tweets = []
    with tqdm(total=len(tweets), colour='green', desc='Processando') as pbar:
      for tweet in tweets:
          tweet = emoji.demojize(tweet, language='pt')
          tweet = tweet.replace('_', ' ')
          tweet = normalize('NFKD', tweet).encode('ASCII', 'ignore').decode('ASCII')
          tweet = repetion_pattern.sub(r'\1', tweet)
          tweet = re.sub(r'https?://\w+', '', tweet)
          tweet = re.sub(r'@\w+', ' ', tweet)
          tweet = re.sub(r'\s\s+', ' ', tweet)
          new_tweets.append(tweet.strip())
          pbar.update(1)
    return new_tweets

In [63]:
# Carregar os dados
data = pd.read_csv('./dataset-merged/dataset_merged.csv')  # Seu arquivo de dados

# Dividir os dados em treinamento, teste
train_texts, test_texts, train_labels, test_labels = train_test_split(preprocessar_tweets(data['tweet']), data['hatespeech'],
                                                                      train_size=0.8, random_state=42)

Processando: 100%|[32m████████████████████████████████████████████████████████████[0m| 12622/12622 [00:01<00:00, 10104.27it/s][0m


In [74]:
# Carregar o modelo BERT pré-treinado
model = SentenceTransformer('neuralmind/bert-base-portuguese-cased')
# model = SentenceTransformer('adalbertojunior/distilbert-portuguese-cased')
# model = SentenceTransformer('pablocosta/bertabaporu-base-uncased')


# Obter embeddings dos textos de treinamento
train_embeddings = model.encode(train_texts)
test_embeddings = model.encode(test_texts)

No sentence-transformers model found with name C:\Users\intel/.cache\torch\sentence_transformers\neuralmind_bert-base-portuguese-cased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at C:\Users\intel/.cache\torch\sentence_transformers\neuralmind_bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initial

In [75]:
classifiers = {
    'Logistic Regression': LogisticRegression(class_weight=None, max_iter=1500),
    'Categorical NB': CategoricalNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(class_weight=None),
    'XGBClassifier': XGBClassifier(n_estimators=100),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight=None)
}

In [76]:
print('\nEvaluation\n')

for clf_name, clf in classifiers.items():

    print(f'\n  Classifier: {clf_name}\n')
    
    if clf_name != 'Categorical NB':
        clf.fit(train_embeddings, train_labels)  
        pred = clf.predict(test_embeddings)
    else:
        train_features = np.mean(train_embeddings, axis=1)
        train_features = pd.qcut(train_features, q=10, labels=False, duplicates='drop')
        train_features = train_features.reshape(-1, 1)
        test_features = np.mean(test_embeddings, axis=1)
        test_features = pd.qcut(test_features, q=10, labels=False, duplicates='drop')
        test_features = test_features.reshape(-1, 1)
        
        clf.fit(train_features, train_labels)  
        pred = clf.predict(test_features)
        
        
    report = classification_report(test_labels, pred, zero_division=0)
    
    print(report)
    
    # Calcula a pontuação F1-média
    f1_macro = f1_score(test_labels, pred, average='macro')
    f1_micro = f1_score(test_labels, pred, average='micro')
    f1_weighted = f1_score(test_labels, pred, average='weighted')

    print("F1-macro: {:.4f}".format(f1_macro))
    print("F1-micro: {:.4f}".format(f1_micro))
    print("F1-weighted: {:.4f}".format(f1_weighted))
    #ConfusionMatrixDisplay.from_predictions(test_labels, pred)
    #plt.show()


Evaluation


  Classifier: Logistic Regression

              precision    recall  f1-score   support

           0       0.76      0.71      0.74       988
           1       0.82      0.86      0.84      1537

    accuracy                           0.80      2525
   macro avg       0.79      0.79      0.79      2525
weighted avg       0.80      0.80      0.80      2525

F1-macro: 0.7890
F1-micro: 0.8016
F1-weighted: 0.8002

  Classifier: Categorical NB

              precision    recall  f1-score   support

           0       0.50      0.13      0.20       988
           1       0.62      0.92      0.74      1537

    accuracy                           0.61      2525
   macro avg       0.56      0.52      0.47      2525
weighted avg       0.57      0.61      0.53      2525

F1-macro: 0.4709
F1-micro: 0.6079
F1-weighted: 0.5294

  Classifier: KNN

              precision    recall  f1-score   support

           0       0.75      0.56      0.64       988
           1       0.75      