# Treinamento de modelos de aprendizagem de máquina em classificação binária e multilabel usando o dataset ToLD-Br

## Imports e configurações iniciais

In [1]:
import re
import string
import unicodedata
import json

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/eliane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report, hamming_loss)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

In [3]:
SEED = 42
np.random.seed(SEED)

## Carregamento dos dados

In [4]:
dataset_path = "../ToLD-BR.csv"

df = pd.read_csv(dataset_path, encoding="utf-8")

In [5]:
df.head()

Unnamed: 0,text,homophobia,obscene,insult,racism,misogyny,xenophobia
0,Meu nivel de amizade com isis é ela ter meu in...,0.0,0.0,2.0,0.0,0.0,0.0
1,"rt @user @user o cara adultera dados, que fora...",0.0,0.0,1.0,0.0,0.0,0.0
2,@user @user @user o cara só é simplesmente o m...,0.0,2.0,1.0,0.0,0.0,0.0
3,eu to chorando vei vsf e eu nem staneio izone ...,0.0,1.0,0.0,0.0,0.0,0.0
4,Eleitor do Bolsonaro é tão ignorante q não per...,0.0,1.0,2.0,0.0,0.0,0.0


## Pré processamento de dados

Remoção de acentos, URLs, menções, hashtags, dígitos e pontuação

In [6]:
def remove_accentuation(text):
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join([c for c in nfkd if not unicodedata.combining(c)])

In [7]:
def clean_text(text, stopwords_set=None):
    if text is None:
        return ""
    text = text.lower()
    text = remove_accentuation(text)
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#\w+", " ", text)
    text = re.sub(r"\d+", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = text.split()
    if stopwords_set is not None:
        tokens = [w for w in tokens if w not in stopwords_set]
    return " ".join(tokens)

In [8]:
def preprocess(series, stopwords_set=None):
    return series.map(lambda x: clean_text(str(x), stopwords_set=stopwords_set))

## Tratamento e Rotulagem

In [9]:
stopwords_pt = set(stopwords.words("portuguese"))
df["text_clean"] = preprocess(df["text"], stopwords_set=stopwords_pt)

In [10]:
categories = ["homophobia","obscene","insult","racism","misogyny","xenophobia"]

In [11]:
vect = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9)

### Binária

In [12]:
df_binary = df.copy()
df_binary["toxic_binary"] = (df_binary[categories].sum(axis=1) > 0).astype(int)

In [13]:
X_binary = df_binary["text_clean"].values
y_binary = df_binary["toxic_binary"].values

In [14]:
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X_binary, y_binary, test_size=0.2, stratify=y_binary, random_state=SEED
)

In [15]:
X_train_bin_tfidf = vect.fit_transform(X_train_bin)
X_test_bin_tfidf  = vect.transform(X_test_bin)

### Multilabel

In [12]:
df_multil = df.copy()

In [13]:
y_multil = df_multil[categories].apply(lambda col: (col > 0).astype(int)).values
X_multil = df_multil["text_clean"].values

In [14]:
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(
    X_multil, y_multil, test_size=0.2, random_state=SEED
)

In [15]:
X_train_ml_tfidf = vect.fit_transform(X_train_ml)
X_test_ml_tfidf  = vect.transform(X_test_ml)

## Execução e Avaliação

In [16]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=200, random_state=SEED, class_weight="balanced"),
    "NaiveBayes":        MultinomialNB(),
    "DecisionTree":      DecisionTreeClassifier(random_state=SEED, class_weight="balanced"),
    "RandomForest":      RandomForestClassifier(n_estimators=100, random_state=SEED, class_weight="balanced", n_jobs=-1),
    "KNN":               KNeighborsClassifier(n_neighbors=5)
}

In [17]:
estimators = [(name, model) for name, model in models.items()]

In [18]:
def save_results_json(execution_type, results):
  with open(f"./results_{execution_type}.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)
  print(f"Resultados de classificação {execution_type} salvos em results_{execution_type}.json")

### Binária

In [23]:
def test_results_binary(name, y_test, y_pred, results):
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    cm   = confusion_matrix(y_test, y_pred).tolist()
    print(f"{name} — acc:{acc:.4f}, prec:{prec:.4f}, rec:{rec:.4f}, f1:{f1:.4f}")
    print(classification_report(y_test, y_pred))
    results[name] = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1,
        "confusion_matrix": cm
    }

In [24]:
def run_models_binary(results, X_train, y_train, X_test, y_test):
  for name, model in models.items():
    print(f"[Binária] Treinando modelo: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_results_binary(name, y_test, y_pred, results)

### Multilabel

In [19]:
def test_results_multilabel(name, y_test, y_pred, results):
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec  = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1   = f1_score(y_test, y_pred, average="macro", zero_division=0)
    cm   = [confusion_matrix(y_test[:,i], y_pred[:,i]).tolist() for i in range(y_test.shape[1])]
    hm   = hamming_loss(y_test, y_pred)
    print(f"{name} multilabel — acc:{acc:.4f}, prec:{prec:.4f}, rec:{rec:.4f}, f1:{f1:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))
    results[name] = {
        "accuracy": acc,
        "precision_macro": prec,
        "recall_macro": rec,
        "f1_macro": f1,
        "hamming_loss": hm,
        "confusion_matrix_per_class": cm
    }

In [20]:
def run_models_multilabel(results, X_train, y_train, X_test, y_test):
  for name, base_model in models.items():
    print(f"[Multilabel] Treinando modelo: {name}")
    ml_model = OneVsRestClassifier(base_model)
    ml_model.fit(X_train, y_train)
    y_pred_ml = ml_model.predict(X_test)
    test_results_multilabel(name, y_test, y_pred_ml, results)

### Ensemble Voting

In [21]:
def ensemble_voting(execution_type, results, X_train, y_train, X_test, y_test):
  print(f"[{execution_type}] Treinando Voting Ensemble")
  voting = OneVsRestClassifier(VotingClassifier(estimators=estimators, voting="soft", n_jobs=-1)) if execution_type == "Multilabel" else VotingClassifier(estimators=estimators, voting="soft", n_jobs=-1)
  voting.fit(X_train, y_train)
  y_pred_v = voting.predict(X_test)
  if execution_type == "Multilabel":
    test_results_multilabel("VotingEnsemble", y_test, y_pred_v, results)
  else:
    test_results_binary("VotingEnsemble", y_test, y_pred_v, results)

## Treinamentos

### Binário

In [28]:
results_bin = {}

run_models_binary(results_bin, X_train_bin_tfidf, y_train_bin, X_test_bin_tfidf, y_test_bin)
ensemble_voting("Binaria", results_bin, X_train_bin_tfidf, y_train_bin, X_test_bin_tfidf, y_test_bin)

save_results_json("binaria", results_bin)

[Binária] Treinando modelo: LogisticRegression
LogisticRegression — acc:0.7426, prec:0.7065, rec:0.7115, f1:0.7090
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      2349
           1       0.71      0.71      0.71      1851

    accuracy                           0.74      4200
   macro avg       0.74      0.74      0.74      4200
weighted avg       0.74      0.74      0.74      4200

[Binária] Treinando modelo: NaiveBayes
NaiveBayes — acc:0.7086, prec:0.7209, rec:0.5527, f1:0.6257
              precision    recall  f1-score   support

           0       0.70      0.83      0.76      2349
           1       0.72      0.55      0.63      1851

    accuracy                           0.71      4200
   macro avg       0.71      0.69      0.69      4200
weighted avg       0.71      0.71      0.70      4200

[Binária] Treinando modelo: DecisionTree
DecisionTree — acc:0.7045, prec:0.6486, rec:0.7191, f1:0.6820
              precision    re

### Multilabel

In [22]:
results_ml = {}

run_models_multilabel(results_ml, X_train_ml_tfidf, y_train_ml, X_test_ml_tfidf, y_test_ml)
ensemble_voting("Multilabel", results_ml, X_train_ml_tfidf, y_train_ml, X_test_ml_tfidf, y_test_ml)

save_results_json("multilabel", results_ml)

[Multilabel] Treinando modelo: LogisticRegression
LogisticRegression multilabel — acc:0.5948, prec:0.3648, rec:0.6020, f1:0.4463
              precision    recall  f1-score   support

           0       0.39      0.74      0.51        53
           1       0.64      0.76      0.69      1304
           2       0.52      0.69      0.59       857
           3       0.18      0.50      0.27        22
           4       0.25      0.47      0.33        97
           5       0.21      0.45      0.29        31

   micro avg       0.54      0.72      0.62      2364
   macro avg       0.36      0.60      0.45      2364
weighted avg       0.56      0.72      0.63      2364
 samples avg       0.30      0.32      0.30      2364

[Multilabel] Treinando modelo: NaiveBayes
NaiveBayes multilabel — acc:0.6262, prec:0.4330, rec:0.0944, f1:0.1403
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        53
           1       0.76      0.35      0.48      130